From f73ad1c2e27d2418e428eb80d5047c5bf2b2770d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Dec 2011 14:13:55 -0500 Subject: [PATCH 2/6] Bugfix/Rewrite: Algorithm to determine adaptor boundaries The algorithm wasn't accounting for the case where the read is the reverse strand and the insert size is negative. * Fixed and rewrote for more clarity (with Ryan, Mark and Eric). * Restructured the code to handle GATKSAMRecords only * Cleaned up the other structures and functions around it to minimize clutter and potential for error. * Added unit tests for all 4 cases of adaptor boundaries. --- .../gatk/iterators/LocusIteratorByState.java | 7 +- .../sting/utils/sam/ReadUtils.java | 269 +++++++----------- .../utils/{ => sam}/ReadUtilsUnitTest.java | 47 ++- 3 files changed, 155 insertions(+), 168 deletions(-) rename public/java/test/org/broadinstitute/sting/utils/{ => sam}/ReadUtilsUnitTest.java (66%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index ee3ea63eb..75e787e05 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReservoirDownsampler; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -432,7 +431,7 @@ public class LocusIteratorByState extends LocusIterator { while(iterator.hasNext()) { SAMRecordState state = iterator.next(); if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { - if ( filterBaseInRead(state.getRead(), location.getStart()) ) { + if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) { //discarded_bases++; //printStatus("Adaptor bases", discarded_adaptor_bases); continue; @@ -481,8 +480,8 @@ public class LocusIteratorByState extends LocusIterator { * @param pos * @return */ - private static boolean filterBaseInRead(SAMRecord rec, long pos) { - return ReadUtils.readPairBaseOverlapType(rec, pos) == ReadUtils.OverlapType.IN_ADAPTOR; + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); } private void updateReadStates() { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index a46a7f0bb..aa90ecf6f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -45,85 +45,13 @@ import java.util.*; public class ReadUtils { private ReadUtils() { } - // ---------------------------------------------------------------------------------------------------- - // - // Reduced read utilities - // - // ---------------------------------------------------------------------------------------------------- + private static int DEFAULT_ADAPTOR_SIZE = 100; - // ---------------------------------------------------------------------------------------------------- - // - // General utilities - // - // ---------------------------------------------------------------------------------------------------- - public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { - SAMFileHeader copy = new SAMFileHeader(); - - copy.setSortOrder(toCopy.getSortOrder()); - copy.setGroupOrder(toCopy.getGroupOrder()); - copy.setProgramRecords(toCopy.getProgramRecords()); - copy.setReadGroups(toCopy.getReadGroups()); - copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - - for (Map.Entry e : toCopy.getAttributes()) - copy.setAttribute(e.getKey(), e.getValue()); - - return copy; + public enum ClippingTail { + LEFT_TAIL, + RIGHT_TAIL } - public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { - if (file.endsWith(".bam")) - return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); - return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); - } - - public static boolean isPlatformRead(SAMRecord read, String name) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; - } - - // --------------------------------------------------------------------------------------------------------- - // - // utilities for detecting overlapping reads - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Detects read pairs where the reads are so long relative to the over fragment size that they are - * reading into each other's adaptors. - * - * Normally, fragments are sufficiently far apart that reads aren't reading into each other. - * - * |--------------------> first read - * <--------------------| second read - * - * Sometimes, mostly due to lab errors or constraints, fragment library are made too short relative to the - * length of the reads. For example, it's possible to have 76bp PE reads with 125 bp inserts, so that ~25 bp of each - * read overlaps with its mate. - * - * |--------OOOOOOOOOOOO> first read - * first read - * [record in hand] - * s2 - * <-----------------------| - * - * s1, e1, and s2 are all in the record. From isize we can can compute e2 as s1 + isize + 1 - * - * s2 - * |-----------------------> - * s1 e1 - * <-----------------------| [record in hand] - * - * Here we cannot calculate e2 since the record carries s2 and e1 + isize is s2 now! - * - * This makes the following code a little nasty, since we can only detect if a base is in the adaptor, but not - * if it overlaps the read. - * - * @param read - * @param basePos - * @param adaptorLength - * @return - */ - public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos, final int adaptorLength) { - OverlapType state = OverlapType.NOT_OVERLAPPING; + public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { + SAMFileHeader copy = new SAMFileHeader(); - Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); + copy.setSortOrder(toCopy.getSortOrder()); + copy.setGroupOrder(toCopy.getGroupOrder()); + copy.setProgramRecords(toCopy.getProgramRecords()); + copy.setReadGroups(toCopy.getReadGroups()); + copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - if ( adaptorBoundaries != null ) { // we're not an unmapped pair -- cannot filter out + for (Map.Entry e : toCopy.getAttributes()) + copy.setAttribute(e.getKey(), e.getValue()); - boolean inAdapator = basePos >= adaptorBoundaries.first && basePos <= adaptorBoundaries.second; - - if ( inAdapator ) { - state = OverlapType.IN_ADAPTOR; - //System.out.printf("baseOverlapState: %50s negStrand=%b base=%d start=%d stop=%d, adaptorStart=%d adaptorEnd=%d isize=%d => %s%n", - // read.getReadName(), read.getReadNegativeStrandFlag(), basePos, read.getAlignmentStart(), read.getAlignmentEnd(), adaptorBoundaries.first, adaptorBoundaries.second, read.getInferredInsertSize(), state); - } - } - - return state; + return copy; } - private static Pair getAdaptorBoundaries(SAMRecord read, int adaptorLength) { - int isize = read.getInferredInsertSize(); - if ( isize == 0 ) - return null; // don't worry about unmapped pairs + public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { + if (file.endsWith(".bam")) + return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); + return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); + } - int adaptorStart, adaptorEnd; - - if ( read.getReadNegativeStrandFlag() ) { - // we are on the negative strand, so our mate is on the positive strand - int mateStart = read.getMateAlignmentStart(); - adaptorStart = mateStart - adaptorLength - 1; - adaptorEnd = mateStart - 1; - } else { - // we are on the positive strand, so our mate is on the negative strand - int mateEnd = read.getAlignmentStart() + isize - 1; - adaptorStart = mateEnd + 1; - adaptorEnd = mateEnd + adaptorLength; + public static boolean isPlatformRead(SAMRecord read, String name) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); } + return false; + } - return new Pair(adaptorStart, adaptorEnd); + + /** + * is this base inside the adaptor of the read? + * + * There are two cases to treat here: + * + * 1) Read is in the negative strand => Adaptor boundary is on the left tail + * 2) Read is in the positive strand => Adaptor boundary is on the right tail + * + * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) + * + * @param read the read to test + * @param basePos base position in REFERENCE coordinates (not read coordinates) + * @return whether or not the base is in the adaptor + */ + public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { + Integer adaptorBoundary = getAdaptorBoundary(read); + if (adaptorBoundary == null || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + return false; + + return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; } /** + * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to + * the read boundary. If the read is in the positive strand, this is the first base after the end of the + * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the + * beginning of the fragment. * - * @param read original SAM record - * @param adaptorLength length of adaptor sequence - * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped + * There are two cases we need to treat here: + * + * 1) Our read is in the reverse strand : + * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * + * 2) Our read is in the forward strand : + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * @param read the read being tested for the adaptor boundary + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the insert size cannot be determined (and is necessary for the calculation). */ - public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { + protected static Integer getAdaptorBoundary(final SAMRecord read) { + if ( read.getReadUnmappedFlag() ) + return null; // don't worry about unmapped pairs - Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); - GATKSAMRecord result = read; + final int isize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) + int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) - if ( adaptorBoundaries != null ) { - if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) - result = hardClipStartOfRead(read, adaptorBoundaries.second); - else if ( !read.getReadNegativeStrandFlag() && adaptorBoundaries.first <= read.getAlignmentEnd() ) - result = hardClipEndOfRead(read, adaptorBoundaries.first); - } + if ( read.getReadNegativeStrandFlag() ) + adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) + else if (isize > 0) + adaptorBoundary = read.getAlignmentStart() + isize + 1; // case 2 (see header) + else + return null; // this is a case 2 where for some reason we cannot estimate the insert size - return result; + return adaptorBoundary; } // return true if the read needs to be completely clipped @@ -557,7 +494,6 @@ public class ReadUtils { } - private static int DEFAULT_ADAPTOR_SIZE = 100; /** * @@ -565,12 +501,14 @@ public class ReadUtils { * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped */ public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read) { - return hardClipAdaptorSequence(read, DEFAULT_ADAPTOR_SIZE); + final Integer adaptorBoundary = getAdaptorBoundary(read); + + if (adaptorBoundary == null || !isInsideRead(read, adaptorBoundary)) + return read; + + return (read.getReadNegativeStrandFlag()) ? hardClipStartOfRead(read, adaptorBoundary) : hardClipEndOfRead(read, adaptorBoundary); } - public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos) { - return readPairBaseOverlapType(read, basePos, DEFAULT_ADAPTOR_SIZE); - } public static boolean is454Read(SAMRecord read) { return isPlatformRead(read, "454"); @@ -716,18 +654,6 @@ public class ReadUtils { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } - private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.INSERTION) - return false; - } - return true; - } - - public enum ClippingTail { - LEFT_TAIL, - RIGHT_TAIL - } /** * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in @@ -866,7 +792,24 @@ public class ReadUtils { return comp.compare(read1, read2); } - // TEST UTILITIES + /** + * Is a base inside a read? + * + * @param read the read to evaluate + * @param referenceCoordinate the reference coordinate of the base to test + * @return true if it is inside the read, false otherwise. + */ + public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { + return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); + } + + private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java similarity index 66% rename from public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 630beaece..3878cbfa9 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; @@ -69,4 +69,49 @@ public class ReadUtilsUnitTest extends BaseTest { Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); } + + @Test + public void testGetAdaptorBoundary() { + final byte [] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; + final byte [] quals = {30, 30, 30, 30, 30, 30, 30, 30}; + final String cigar = "8M"; + final int fragmentSize = 10; + final int mateStart = 1000; + final int BEFORE = mateStart - 2; + final int AFTER = mateStart + 2; + int myStart, boundary; + + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); + read.setMateAlignmentStart(mateStart); + read.setInferredInsertSize(fragmentSize); + + // Test case 1: positive strand, first read + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 2: positive strand, second read + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 3: negative strand, second read + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, mateStart - 1); + + // Test case 4: negative strand, first read + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, mateStart - 1); + + } } From 1c4774c475bb8f3e4a1a02336ea6836363f96c6c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 14:30:06 -0500 Subject: [PATCH 3/6] Static versions of the hard clipping utilities For simplified access to the hard clipping utilities. No need to create a ReadClipper object if you are not doing multiple complicated clipping operations, just use the static methods. examples: ReadClipper.hardClipLowQualEnds(2); ReadClipper.hardClipAdaptorSequence(); --- .../sting/utils/clipping/ReadClipper.java | 114 +++++++++++++----- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index f19a7a04f..d82472bae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -50,6 +50,44 @@ public class ReadClipper { return read; } + /** + * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. + * + * @param algorithm + * @return + */ + public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { + if (ops == null) + return getRead(); + else { + try { + GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); + for (ClippingOp op : getOps()) { + //check if the clipped read can still be clipped in the range requested + if (op.start < clippedRead.getReadLength()) { + ClippingOp fixedOperation = op; + if (op.stop >= clippedRead.getReadLength()) + fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); + + clippedRead = fixedOperation.apply(algorithm, clippedRead); + } + } + wasClipped = true; + ops.clear(); + if ( clippedRead.isEmpty() ) + return new GATKSAMRecord( clippedRead.getHeader() ); + return clippedRead; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // this should never happen + } + } + } + + + + + // QUICK USE UTILITY FUNCTION + public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { return hardClipByReferenceCoordinates(-1, refStop); } @@ -163,39 +201,13 @@ public class ReadClipper { return clipRead(ClippingRepresentation.HARDCLIP_BASES); } + public GATKSAMRecord hardClipAdaptorSequence () { + final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + if (adaptorBoundary == null || !ReadUtils.isInsideRead(read, adaptorBoundary)) + return read; - /** - * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. - * - * @param algorithm - * @return - */ - public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { - if (ops == null) - return getRead(); - else { - try { - GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); - for (ClippingOp op : getOps()) { - //check if the clipped read can still be clipped in the range requested - if (op.start < clippedRead.getReadLength()) { - ClippingOp fixedOperation = op; - if (op.stop >= clippedRead.getReadLength()) - fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); - - clippedRead = fixedOperation.apply(algorithm, clippedRead); - } - } - wasClipped = true; - ops.clear(); - if ( clippedRead.isEmpty() ) - return new GATKSAMRecord( clippedRead.getHeader() ); - return clippedRead; - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // this should never happen - } - } + return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); } public GATKSAMRecord hardClipLeadingInsertions() { @@ -218,4 +230,44 @@ public class ReadClipper { this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); } + + + + // STATIC VERSIONS OF THE QUICK CLIPPING FUNCTIONS + + public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); + } + + public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); + } + + public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { + return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); + } + + public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { + return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); + } + + public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { + return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); + } + + public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipSoftClippedBases(); + } + + public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipAdaptorSequence(); + } + + public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipLeadingInsertions(); + } + + public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + return (new ReadClipper(read)).revertSoftClippedBases(); + } } From 07128a2ad2cfe4d2b63a4df947ef7831573bbe1d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 14:35:33 -0500 Subject: [PATCH 4/6] ReadUtils cleanup * Removed all clipping functionality from ReadUtils (it should all be done using the ReadClipper now) * Cleaned up functionality that wasn't being used or had been superseded by other code (in an effort to reduce multiple unsupported implementations) * Made all meaningful functions public and added better comments/explanation to the headers --- .../sting/gatk/walkers/ClipReadsWalker.java | 5 +- .../gatk/walkers/SplitSamFileWalker.java | 18 +- ...elGenotypeLikelihoodsCalculationModel.java | 3 +- .../gatk/walkers/indels/IndelRealigner.java | 2 +- .../indels/PairHMMIndelErrorModel.java | 3 +- .../sting/utils/sam/ReadUtils.java | 457 ++++-------------- 6 files changed, 112 insertions(+), 376 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index ff3867120..c28e205bf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -299,9 +299,8 @@ public class ClipReadsWalker extends ReadWalker readGroups = new ArrayList(); header.setReadGroups(readGroups); @@ -121,4 +121,20 @@ public class SplitSamFileWalker extends ReadWalker e : toCopy.getAttributes()) + copy.setAttribute(e.getKey(), e.getValue()); + + return copy; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 653a6f6e7..fe2086d47 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; @@ -125,7 +126,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; if(ReadUtils.is454Read(read)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index ba031c497..8f939fc49 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -463,7 +463,7 @@ public class IndelRealigner extends ReadWalker { private void emitReadLists() { // pre-merge lists to sort them in preparation for constrained SAMFileWriter readsNotToClean.addAll(readsToClean.getReads()); - ReadUtils.coordinateSortReads(readsNotToClean); + ReadUtils.sortReadsByCoordinate(readsNotToClean); manager.addReads(readsNotToClean, readsActuallyCleaned); readsToClean.clear(); readsNotToClean.clear(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 09968f47e..d893e620e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,6 +32,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -409,7 +410,7 @@ public class PairHMMIndelErrorModel { } else { //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); - SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + SAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index aa90ecf6f..0c3433eba 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -47,11 +47,34 @@ public class ReadUtils { private static int DEFAULT_ADAPTOR_SIZE = 100; + /** + * A marker to tell which end of the read has been clipped + */ public enum ClippingTail { LEFT_TAIL, RIGHT_TAIL } + /** + * A HashMap of the SAM spec read flag names + * + * Note: This is not being used right now, but can be useful in the future + */ + private static final Map readFlagNames = new HashMap(); + static { + readFlagNames.put(0x1, "Paired"); + readFlagNames.put(0x2, "Proper"); + readFlagNames.put(0x4, "Unmapped"); + readFlagNames.put(0x8, "MateUnmapped"); + readFlagNames.put(0x10, "Forward"); + //readFlagNames.put(0x20, "MateForward"); + readFlagNames.put(0x40, "FirstOfPair"); + readFlagNames.put(0x80, "SecondOfPair"); + readFlagNames.put(0x100, "NotPrimary"); + readFlagNames.put(0x200, "NON-PF"); + readFlagNames.put(0x400, "Duplicate"); + } + /** * This enum represents all the different ways in which a read can overlap an interval. * @@ -96,38 +119,22 @@ public class ReadUtils { */ public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} - public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { - SAMFileHeader copy = new SAMFileHeader(); - - copy.setSortOrder(toCopy.getSortOrder()); - copy.setGroupOrder(toCopy.getGroupOrder()); - copy.setProgramRecords(toCopy.getProgramRecords()); - copy.setReadGroups(toCopy.getReadGroups()); - copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - - for (Map.Entry e : toCopy.getAttributes()) - copy.setAttribute(e.getKey(), e.getValue()); - - return copy; - } - + /** + * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular + * SAMFileWriter without compression otherwise. + * + * @param header + * @param presorted + * @param file + * @param compression + * @return a SAMFileWriter with the compression level if it is a bam. + */ public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { if (file.endsWith(".bam")) return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); } - public static boolean isPlatformRead(SAMRecord read, String name) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; - } - - /** * is this base inside the adaptor of the read? * @@ -175,7 +182,7 @@ public class ReadUtils { * @param read the read being tested for the adaptor boundary * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the insert size cannot be determined (and is necessary for the calculation). */ - protected static Integer getAdaptorBoundary(final SAMRecord read) { + public static Integer getAdaptorBoundary(final SAMRecord read) { if ( read.getReadUnmappedFlag() ) return null; // don't worry about unmapped pairs @@ -192,363 +199,54 @@ public class ReadUtils { return adaptorBoundary; } - // return true if the read needs to be completely clipped - private static GATKSAMRecord hardClipStartOfRead(GATKSAMRecord oldRec, int stopPosition) { - - if ( stopPosition >= oldRec.getAlignmentEnd() ) { - // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); - return null; - } - - GATKSAMRecord read; - try { - read = (GATKSAMRecord)oldRec.clone(); - } catch (Exception e) { - return null; - } - - //System.out.printf("Clipping start of read: %50s start=%d adaptorEnd=%d isize=%d %n", - // read.getReadName(), read.getAlignmentStart(), stopPosition, read.getInferredInsertSize()); - - Cigar oldCigar = read.getCigar(); - LinkedList newCigarElements = new LinkedList(); - int currentPos = read.getAlignmentStart(); - int basesToClip = 0; - int basesAlreadyClipped = 0; - - for ( CigarElement ce : oldCigar.getCigarElements() ) { - - if ( currentPos > stopPosition) { - newCigarElements.add(ce); - continue; - } - - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int i = 0; i < elementLength; i++, currentPos++, basesToClip++) { - if ( currentPos > stopPosition ) { - newCigarElements.add(new CigarElement(elementLength - i, CigarOperator.M)); - break; - } - } - break; - case I: - case S: - basesToClip += elementLength; - break; - case D: - case N: - currentPos += elementLength; - break; - case H: - basesAlreadyClipped += elementLength; - case P: - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - int newLength = bases.length - basesToClip; - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, basesToClip, newBases, 0, newLength); - System.arraycopy(quals, basesToClip, newQuals, 0, newLength); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases - newCigarElements.addFirst(new CigarElement(basesToClip + basesAlreadyClipped, CigarOperator.H)); - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - - // adjust the start accordingly - read.setAlignmentStart(stopPosition + 1); - - return read; - } - - private static GATKSAMRecord hardClipEndOfRead(GATKSAMRecord oldRec, int startPosition) { - - if ( startPosition <= oldRec.getAlignmentStart() ) { - // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); - return null; - } - - GATKSAMRecord read; - try { - read = (GATKSAMRecord)oldRec.clone(); - } catch (Exception e) { - return null; - } - - //System.out.printf("Clipping end of read: %50s adaptorStart=%d end=%d isize=%d %n", - // read.getReadName(), startPosition, read.getAlignmentEnd(), read.getInferredInsertSize()); - - Cigar oldCigar = read.getCigar(); - LinkedList newCigarElements = new LinkedList(); - int currentPos = read.getAlignmentStart(); - int basesToKeep = 0; - int basesAlreadyClipped = 0; - - for ( CigarElement ce : oldCigar.getCigarElements() ) { - - int elementLength = ce.getLength(); - - if ( currentPos >= startPosition ) { - if ( ce.getOperator() == CigarOperator.H ) - basesAlreadyClipped += elementLength; - continue; - } - - switch ( ce.getOperator() ) { - case M: - for (int i = 0; i < elementLength; i++, currentPos++, basesToKeep++) { - if ( currentPos == startPosition ) { - newCigarElements.add(new CigarElement(i, CigarOperator.M)); - break; - } - } - - if ( currentPos != startPosition ) - newCigarElements.add(ce); - break; - case I: - case S: - newCigarElements.add(ce); - basesToKeep += elementLength; - break; - case D: - case N: - newCigarElements.add(ce); - currentPos += elementLength; - break; - case H: - case P: - newCigarElements.add(ce); - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - byte[] newBases = new byte[basesToKeep]; - byte[] newQuals = new byte[basesToKeep]; - System.arraycopy(bases, 0, newBases, 0, basesToKeep); - System.arraycopy(quals, 0, newQuals, 0, basesToKeep); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases - newCigarElements.add(new CigarElement((bases.length - basesToKeep) + basesAlreadyClipped, CigarOperator.H)); - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - - // adjust the stop accordingly - // read.setAlignmentEnd(startPosition - 1); - - return read; - } - /** - * Hard clips away (i.e.g, removes from the read) bases that were previously soft clipped. + * is the read a 454 read ? * - * @param read - * @return + * @param read the read to test + * @return checks the read group tag PL for the default 454 tag */ - @Requires("read != null") - @Ensures("result != null") - public static GATKSAMRecord hardClipSoftClippedBases(GATKSAMRecord read) { - List cigarElts = read.getCigar().getCigarElements(); - - if ( cigarElts.size() == 1 ) // can't be soft clipped, just return - return read; - - int keepStart = 0, keepEnd = read.getReadLength() - 1; - List newCigarElements = new LinkedList(); - - for ( int i = 0; i < cigarElts.size(); i++ ) { - CigarElement ce = cigarElts.get(i); - int l = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - if ( i == 0 ) - keepStart = l; - else - keepEnd = read.getReadLength() - l - 1; - newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); - break; - - default: - newCigarElements.add(ce); - break; - } - } - - // Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S - // this will happen if you soft clip a read that has been hard clipped before - // like: 5H20S => 5H20H - List mergedCigarElements = new LinkedList(); - Iterator cigarElementIterator = newCigarElements.iterator(); - CigarOperator currentOperator = null; - int currentOperatorLength = 0; - while (cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - if (currentOperator != cigarElement.getOperator()) { - if (currentOperator != null) - mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); - currentOperator = cigarElement.getOperator(); - currentOperatorLength = cigarElement.getLength(); - } - else - currentOperatorLength += cigarElement.getLength(); - } - mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); - - return hardClipBases(read, keepStart, keepEnd, mergedCigarElements); - } - - /** - * Hard clips out the bases in read, keeping the bases from keepStart to keepEnd, inclusive. Note these - * are offsets, so they are 0 based - * - * @param read - * @param keepStart - * @param keepEnd - * @param newCigarElements - * @return - */ - @Requires({ - "read != null", - "keepStart >= 0", - "keepEnd < read.getReadLength()", - "read.getReadUnmappedFlag() || newCigarElements != null"}) - @Ensures("result != null") - public static GATKSAMRecord hardClipBases(GATKSAMRecord read, int keepStart, int keepEnd, List newCigarElements) { - int newLength = keepEnd - keepStart + 1; - if ( newLength != read.getReadLength() ) { - try { - read = (GATKSAMRecord)read.clone(); - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, keepStart, newBases, 0, newLength); - System.arraycopy(quals, keepStart, newQuals, 0, newLength); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases, if the read isn't unmapped - if ( ! read.getReadUnmappedFlag() ) { - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - } - } catch ( CloneNotSupportedException e ) { - throw new ReviewedStingException("WTF, where did clone go?", e); - } - } - - return read; - } - - public static GATKSAMRecord replaceSoftClipsWithMatches(GATKSAMRecord read) { - List newCigarElements = new ArrayList(); - - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.SOFT_CLIP ) - newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - else - newCigarElements.add(ce); - } - - if ( newCigarElements.size() > 1 ) { // - CigarElement first = newCigarElements.get(0); - CigarElement second = newCigarElements.get(1); - if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { - newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - newCigarElements.remove(1); - } - } - - if ( newCigarElements.size() > 1 ) { // - CigarElement penult = newCigarElements.get(newCigarElements.size()-2); - CigarElement last = newCigarElements.get(newCigarElements.size()-1); - if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { - newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - newCigarElements.remove(newCigarElements.size()-1); - } - } - - read.setCigar(new Cigar(newCigarElements)); - return read; - } - - - - /** - * - * @param read original SAM record - * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped - */ - public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read) { - final Integer adaptorBoundary = getAdaptorBoundary(read); - - if (adaptorBoundary == null || !isInsideRead(read, adaptorBoundary)) - return read; - - return (read.getReadNegativeStrandFlag()) ? hardClipStartOfRead(read, adaptorBoundary) : hardClipEndOfRead(read, adaptorBoundary); - } - - public static boolean is454Read(SAMRecord read) { return isPlatformRead(read, "454"); } + /** + * is the read a SOLiD read ? + * + * @param read the read to test + * @return checks the read group tag PL for the default SOLiD tag + */ public static boolean isSOLiDRead(SAMRecord read) { return isPlatformRead(read, "SOLID"); } + /** + * is the read a SLX read ? + * + * @param read the read to test + * @return checks the read group tag PL for the default SLX tag + */ public static boolean isSLXRead(SAMRecord read) { return isPlatformRead(read, "ILLUMINA"); } - private static final Map readFlagNames - = new HashMap(); - - static { - readFlagNames.put(0x1, "Paired"); - readFlagNames.put(0x2, "Proper"); - readFlagNames.put(0x4, "Unmapped"); - readFlagNames.put(0x8, "MateUnmapped"); - readFlagNames.put(0x10, "Forward"); - //readFlagNames.put(0x20, "MateForward"); - readFlagNames.put(0x40, "FirstOfPair"); - readFlagNames.put(0x80, "SecondOfPair"); - readFlagNames.put(0x100, "NotPrimary"); - readFlagNames.put(0x200, "NON-PF"); - readFlagNames.put(0x400, "Duplicate"); - } - - public static String readFlagsAsString(GATKSAMRecord read) { - String flags = ""; - for (int flag : readFlagNames.keySet()) { - if ((read.getFlags() & flag) != 0) { - flags += readFlagNames.get(flag) + " "; - } + /** + * checks if the read has a platform tag in the readgroup equal to 'name' ? + * + * @param read the read to test + * @param name the platform name to test + * @return whether or not name == PL tag in the read group of read + */ + public static boolean isPlatformRead(SAMRecord read, String name) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); } - return flags; + return false; } + /** * Returns the collections of reads sorted in coordinate order, according to the order defined * in the reads themselves @@ -556,12 +254,19 @@ public class ReadUtils { * @param reads * @return */ - public final static List coordinateSortReads(List reads) { + public final static List sortReadsByCoordinate(List reads) { final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); Collections.sort(reads, comparer); return reads; } + /** + * If a read starts in INSERTION, returns the first element length. + * + * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. + * @param read + * @return the length of the first insertion, or 0 if there is none (see warning). + */ public final static int getFirstInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(0); if ( e.getOperator() == CigarOperator.I ) @@ -570,8 +275,15 @@ public class ReadUtils { return 0; } + /** + * If a read ends in INSERTION, returns the last element length. + * + * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. + * @param read + * @return the length of the last insertion, or 0 if there is none (see warning). + */ public final static int getLastInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1); + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else @@ -622,6 +334,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(GATKSAMRecord read) { int start = read.getUnclippedStart(); @@ -803,7 +516,13 @@ public class ReadUtils { return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); } - private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + /** + * Is this read all insertion? + * + * @param read + * @return whether or not the only element in the cigar string is an Insertion + */ + public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() != CigarOperator.INSERTION) return false; From cadff4024721a22c4694a68d0fb5e369960e441e Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 14:58:18 -0500 Subject: [PATCH 5/6] getRefCoordSoftUnclippedStart and End refactor These functions are methods of the read, and supplement getAlignmentStart() and getUnclippedStart() by calculating the unclipped start counting only soft clips. * Removed from ReadUtils * Added to GATKSAMRecord * Changed name to getSoftStart() and getSoftEnd * Updated third party code accordingly. --- .../sting/utils/sam/GATKSAMRecord.java | 53 +++++++++++++++++-- .../sting/utils/sam/ReadUtils.java | 42 ++------------- .../utils/clipping/ReadClipperUnitTest.java | 9 ++-- 3 files changed, 57 insertions(+), 47 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index d3a52167a..e8df869e6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -24,10 +24,8 @@ package org.broadinstitute.sting.utils.sam; -import net.sf.samtools.BAMRecord; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; +import com.google.java.contract.Ensures; +import net.sf.samtools.*; import org.broadinstitute.sting.utils.NGSPlatform; import java.util.HashMap; @@ -273,5 +271,52 @@ public class GATKSAMRecord extends BAMRecord { setReadGroup(rg); } + /** + * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * + * @return the unclipped start of the read taking soft clips (but not hard clips) into account + */ + @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) + public int getSoftStart() { + int start = this.getUnclippedStart(); + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + return start; + } + + /** + * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * + * @return the unclipped end of the read taking soft clips (but not hard clips) into account + */ + @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) + public int getSoftEnd() { + int stop = this.getUnclippedStart(); + + if (ReadUtils.readIsEntirelyInsertion(this)) + return stop; + + int shift = 0; + CigarOperator lastOperator = null; + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + stop += shift; + lastOperator = cigarElement.getOperator(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) + shift = cigarElement.getLength(); + else + shift = 0; + } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; + } + + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 0c3433eba..24fd48387 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -299,8 +299,8 @@ public class ReadUtils { */ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { - int sStart = getRefCoordSoftUnclippedStart(read); - int sStop = getRefCoordSoftUnclippedEnd(read); + int sStart = read.getSoftStart(); + int sStop = read.getSoftEnd(); int uStart = read.getUnclippedStart(); int uStop = read.getUnclippedEnd(); @@ -334,40 +334,6 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) - public static int getRefCoordSoftUnclippedStart(GATKSAMRecord read) { - int start = read.getUnclippedStart(); - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else - break; - } - return start; - } - - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) - public static int getRefCoordSoftUnclippedEnd(GATKSAMRecord read) { - int stop = read.getUnclippedStart(); - - if (readIsEntirelyInsertion(read)) - return stop; - - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - stop += shift; - lastOperator = cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - shift = 0; - } - return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; - } - - /** * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns @@ -407,14 +373,14 @@ public class ReadUtils { * @param refCoord * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) */ - @Requires({"refCoord >= getRefCoordSoftUnclippedStart(read)", "refCoord <= getRefCoordSoftUnclippedEnd(read)"}) + @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; - int goal = refCoord - getRefCoordSoftUnclippedStart(read); // The goal is to move this many reference bases + int goal = refCoord - read.getSoftStart(); // The goal is to move this many reference bases boolean goalReached = refBases == goal; Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index e228762b7..d1d5ddd8f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -90,13 +89,13 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); for (int i=alnStart; i<=alnEnd; i++) { - if (ReadUtils.getRefCoordSoftUnclippedStart(read) == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); if (!clipLeft.isEmpty()) Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); } - if (ReadUtils.getRefCoordSoftUnclippedEnd(read) == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); @@ -111,7 +110,7 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); - if (ReadUtils.getRefCoordSoftUnclippedStart(read) == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side for (int i=alnStart; i<=alnEnd; i++) { GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); if (!clipLeft.isEmpty()) @@ -127,7 +126,7 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); - if (ReadUtils.getRefCoordSoftUnclippedEnd(read) == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side for (int i=alnStart; i<=alnEnd; i++) { GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. From 731a4634152e682a61d7acd268c3656ae8b2d1c0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 17:44:21 -0500 Subject: [PATCH 6/6] Updated IntegrationTests with new adaptor clipper phew! --- .../walkers/PileupWalkerIntegrationTest.java | 5 ++- .../VariantAnnotatorIntegrationTest.java | 12 +++--- .../CallableLociWalkerIntegrationTest.java | 8 ++-- .../DepthOfCoverageB36IntegrationTest.java | 15 +++---- .../DepthOfCoverageIntegrationTest.java | 40 +++++++++---------- .../UnifiedGenotyperIntegrationTest.java | 22 +++++----- .../RecalibrationWalkersIntegrationTest.java | 30 +++++++------- .../sting/utils/sam/ReadUtilsUnitTest.java | 14 ------- 8 files changed, 69 insertions(+), 77 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index a01a3f281..8bea5385a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -29,9 +29,12 @@ public class PileupWalkerIntegrationTest extends WalkerTest { String gatk_args = "-T Pileup -I " + validationDataLocation + "OV-0930.normal.chunk.bam " + "-R " + hg18Reference + " -show_indels -o %s"; - String expected_md5="06eedc2e7927650961d99d703f4301a4"; + String expected_md5="da2a02d02abac9de14cc4b187d8595a1"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args,1,Arrays.asList(expected_md5)); executeTest("Testing the extended pileup with indel records included on a small chunk of Ovarian dataset with 20 indels (1 D, 19 I)", spec); + // before Adaptor clipping + // String expected_md5="06eedc2e7927650961d99d703f4301a4"; + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 245fda3c7..83c3d3a1e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fbb656369eaa48153d127bd12db59d8f")); + Arrays.asList("ac5f409856a1b79316469733e62abb91")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("2977bb30c8b84a5f4094fe6090658561")); + Arrays.asList("f9aa7bee5a61ac1a9187d0cf1e8af471")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("42dd979a0a931c18dc9be40308bac321")); + Arrays.asList("6f27fd863b6718d59d2a2d8e2a20bcae")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92")); + Arrays.asList("40bbd3d5a2397a007c0e74211fb33433")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("477eac07989593b58bb361f3429c085a")); + Arrays.asList("40622d39072b298440a77ecc794116e7")); executeTest("test exclude annotations", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("062155edec46a8c52243475fbf3a2943")); + Arrays.asList("31faae1bc588d195ff553cf6c47fabfa")); executeTest("test overwriting header", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java index 3783525d1..186e5c3b7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java @@ -32,13 +32,13 @@ import java.util.Arrays; public class CallableLociWalkerIntegrationTest extends WalkerTest { final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; - final static String SUMMARY_MD5 = "ed4c255bb78313b8e7982127caf3d6c4"; + final static String SUMMARY_MD5 = "cd597a8dae35c226a2cb110b1c9f32d5"; @Test public void testCallableLociWalkerBed() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("884c9c2d96419d990a708d2bd98fcefa", SUMMARY_MD5)); + Arrays.asList("c86ac1ef404c11d5e5452e020c8f7ce9", SUMMARY_MD5)); executeTest("formatBed", spec); } @@ -46,7 +46,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalkerPerBase() { String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("fb4524f8b3b213060c0c5b85362b5902", SUMMARY_MD5)); + Arrays.asList("d8536a55fe5f6fdb1ee6c9511082fdfd", SUMMARY_MD5)); executeTest("format_state_per_base", spec); } @@ -62,7 +62,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalker3() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("86bd1a5f79356b3656412c4b1c60709a", "6fefb144a60b89c27293ce5ca6e10e6a")); + Arrays.asList("bc966060184bf4605a31da7fe383464e", "d624eda8f6ed14b9251ebeec73e37867")); executeTest("formatBed lots of arguments", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java index 043b2eaf2..f4cd4968b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java @@ -61,13 +61,14 @@ public class DepthOfCoverageB36IntegrationTest extends WalkerTest { File baseOutputFile = this.createTempFile("depthofcoveragemapq0",".tmp"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("f39af6ad99520fd4fb27b409ab0344a0",baseOutputFile); - spec.addAuxFile("6b15f5330414b6d4e2f6caea42139fa1", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("cc6640d82077991dde8a2b523935cdff", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("5b6c16a1c667c844882e9dce71454fc4",baseOutputFile); + spec.addAuxFile("fc161ec1b61dc67bc6a5ce36cb2d02c9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("89321bbfb76a4e1edc0905d50503ba1f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("0fb627234599c258a3fee1b2703e164a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("cb73a0fa0cee50f1fb8f249315d38128", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("347b47ef73fbd4e277704ddbd7834f69", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("4ec920335d4b9573f695c39d62748089", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("4dd16b659065e331ed4bd3ab0dae6c1b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("2be0c18b501f4a3d8c5e5f99738b4713", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("5a26ef61f586f58310812580ce842462", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + execute("testMapQ0Only",spec); } @@ -83,7 +84,7 @@ public class DepthOfCoverageB36IntegrationTest extends WalkerTest { File baseOutputFile = this.createTempFile("testManySamples",".tmp"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("c9561b52344536d2b06ab97b0bb1a234",baseOutputFile); + spec.addAuxFile("d73fa1fc492f7dcc1d75056f8c12c92a",baseOutputFile); execute("testLotsOfSamples",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 646fb5e77..9d6638d53 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -55,26 +55,26 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.setOutputFileLocation(baseOutputFile); // now add the expected files that get generated - spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); - spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); - spec.addAuxFile("229b9b5bc2141c86dbc69c8acc9eba6a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); - spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("471c34ad2e4f7228efd20702d5941ba9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("9667c77284c2c08e647b162d0e9652d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("5a96c75f96d6fa6ee617451d731dae37", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); - spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); - spec.addAuxFile("d32a8c425fadcc4c048bd8b48d0f61e5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); - spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); - spec.addAuxFile("2aae346204c5f15517158da8e61a6c16", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); - spec.addAuxFile("e70952f241eebb9b5448f2e7cb288131", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); - spec.addAuxFile("054ed1e184f46d6a170dc9bf6524270c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); - spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("a395dafde101971d2b9e5ddb6cd4b7d0", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); - spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("e013cb5b11b0321a81c8dbd7c1863787", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("661160f571def8c323345b5859cfb9da", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); - + spec.addAuxFile("19e862f7ed3de97f2569803f766b7433", baseOutputFile); + spec.addAuxFile("c64cc5636d4880b80b71169ed1832cd7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); + spec.addAuxFile("1a8ba07a60e55f9fdadc89d00b1f3394", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("0075cead73a901e3a9d07c5d9c2b75f4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); + spec.addAuxFile("d757be2f953f893e66eff1ef1f0fff4e", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("de08996729c774590d6a4954c906fe84", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("58ad39b100d1f2af7d119f28ba626bfd", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("0b4ce6059e6587ae5a986afbbcc7d783", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); + spec.addAuxFile("adc2b2babcdd72a843878acf2d510ca7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); + spec.addAuxFile("884281c139241c5db3c9f90e8684d084", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); + spec.addAuxFile("b90636cad74ff4f6b9ff9a596e145bd6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); + spec.addAuxFile("ad540b355ef90c566bebaeabd70026d2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); + spec.addAuxFile("27fe09a02a5b381e0ed633587c0f4b23", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); + spec.addAuxFile("5fcd53b4bd167b5e6d5f92329cf8678e", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("7a2a19e54f73a8e07de2f020f1f913dd", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("852a079c5e9e93e7daad31fd6a9f4a49", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); + spec.addAuxFile("0828762842103edfaf115ef4e50809c6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("5c5aeb28419bba1decb17f8a166777f2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("e5fd6216b3d6a751f3a90677b4e5bf3c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + execute("testBaseOutputNoFiltering",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e049af064..ba33108cf 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a")); + Arrays.asList("66ed60c6c1190754abd8a0a9d1d8d61e")); executeTest("test MultiSample Pilot1", spec); } @@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "d971d392956aea69c3707da64d24485b" ); - e.put( "--output_mode EMIT_ALL_SITES", "21993e71ca1a06a0d1f11d58e3cc26c3" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" ); + e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -189,7 +189,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("f0fbe472f155baf594b1eeb58166edef")); + Arrays.asList("2b2729414ae855d390e7940956745bce")); executeTest(String.format("test multiple technologies"), spec); } @@ -208,7 +208,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272")); + Arrays.asList("95c6120efb92e5a325a5cec7d77c2dab")); executeTest(String.format("test calling with BAQ"), spec); } @@ -227,7 +227,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("a64d2e65b5927260e4ce0d948760cc5c")); + Arrays.asList("d87ce4b405d4f7926d1c36aee7053975")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -255,7 +255,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("69107157632714150fc068d412e31939")); + Arrays.asList("c5989e5d67d9e5fe8c5c956f12a975da")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -265,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("4ffda07590e06d58ed867ae326d74b2d")); + Arrays.asList("daca0741278de32e507ad367e67753b6")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("45633d905136c86e9d3f90ce613255e5")); + Arrays.asList("0ccc4e876809566510429c64adece2c7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("8bd5028cf294850b8a95b3c0af23d728")); + Arrays.asList("cff6dd0f4eb1ef0b6fc476da6ffead19")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("814dcd66950635a870602d90a1618cce")); + Arrays.asList("1e2a4aab26e9ab0dae709d33a669e036")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index cbcd5835f..b4b1f7b8e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -31,10 +31,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @DataProvider(name = "cctestdata") public Object[][] createCCTestData() { - new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "5a52b00d9794d27af723bcf93366681e" ); + + new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9469d6b65880abe4e5babc1c1a69889d" ); new CCTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941"); - new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" ); - new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "64e9f17a1cf6fc04c1f2717c2d2eca67" ); + new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "36c0c467b6245c2c6c4e9c956443a154" ); + new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "ed15f8bf03bb2ea9b7c26844be829c0d" ); return CCTest.getTests(CCTest.class); } @@ -88,10 +89,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @DataProvider(name = "trtestdata") public Object[][] createTRTestData() { - new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "f020725d9f75ad8f1c14bfae056e250f" ); new TRTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "d04cf1f6df486e45226ebfbf93a188a5"); - new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "74314e5562c1a65547bb0edaacffe602" ); - new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "2a37c6001826bfabf87063b1dfcf594f" ); + new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b2f4757bc47cf23bd9a09f756c250787" ); + new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "313a21a8a88e3460b6e71ec5ffc50f0f" ); + return TRTest.getTests(TRTest.class); } @@ -121,7 +123,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesUseOriginalQuals() { HashMap e = new HashMap(); - e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "278846c55d97bd9812b758468a83f559"); + e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "bd8288b1fc7629e2e8c2cf7f65fefa8f"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -145,7 +147,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "f020725d9f75ad8f1c14bfae056e250f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -174,7 +176,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "8379f24cf5312587a1f92c162ecc220f" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1f643bca090478ba68aac88db835a629" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -200,7 +202,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "613fb2bbe01af8cbe6a188a10c1582ca" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -228,7 +230,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesBED() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "b460478d9683e827784e42bc352db8bb"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "b00e99219aeafe2516c6232b7d6a0a00"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -252,7 +254,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCFPlusDBsnp() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9131d96f39badbf9753653f55b148012"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7b92788ce92f49415af3a75a2e4a2b33"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -280,7 +282,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "8993d32df5cb66c7149f59eccbd57f4c" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "f34f7141351a5dbf9664c67260f94e96" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -306,7 +308,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "5f913c98ca99754902e9d34f99df468f" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "13c83656567cee9e93bda9874ee80234" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 3878cbfa9..e9269ff48 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -33,20 +33,6 @@ public class ReadUtilsUnitTest extends BaseTest { reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); } - private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) { - SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); - String expectedBases = BASES.substring(expectedStart, expectedStop); - String expectedQuals = QUALS.substring(expectedStart, expectedStop); - Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected"); - Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected"); - } - - @Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); } - @Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); } - @Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); } - @Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); } - @Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); } - @Test public void testReducedReads() { Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read");