From 5f8574bd1594771cffe11f8fb5b2cc8a62a05b92 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 24 Aug 2012 10:48:41 -0400 Subject: [PATCH 01/48] Fixing typo in error message. --- public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 41ca58157..f8faa101b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -141,7 +141,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } if (!(this.contiguousP(that))) { - throw new ReviewedStingException("The two genome loc's need to be contigous"); + throw new ReviewedStingException("The two genome loc's need to be contiguous"); } return new GenomeLoc(getContig(), this.contigIndex, From ce55ba98f4b1fec0c84047168a8edda0cc94a033 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Aug 2012 15:01:11 -0400 Subject: [PATCH 02/48] Don't try to left align indels in unmapped reads (which for some reason can still have CIGARs) because the ref context is null. --- .../broadinstitute/sting/gatk/contexts/ReferenceContext.java | 2 +- .../sting/gatk/walkers/indels/LeftAlignIndels.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index 1290319e2..af330bba9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -177,7 +177,7 @@ public class ReferenceContext { * @return The base at the given locus from the reference. 
*/ public byte getBase() { - return getBases()[(int)(locus.getStart() - window.getStart())]; + return getBases()[(locus.getStart() - window.getStart())]; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index b08def44f..6b9bd04d2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -82,7 +82,7 @@ public class LeftAlignIndels extends ReadWalker { public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { // we can not deal with screwy records - if ( read.getCigar().numCigarElements() == 0 ) { + if ( read.getReadUnmappedFlag() || read.getCigar().numCigarElements() == 0 ) { emit(read); return 0; } From 150a9692797744d4b0147d6f49b07d3522b8ec22 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Aug 2012 15:13:28 -0400 Subject: [PATCH 03/48] Be careful with String manipulation when constructing alleles in SomaticIndelDetector --- .../sting/gatk/walkers/indels/SomaticIndelDetector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java index b0c09f78e..3965a63fb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java @@ -1181,10 +1181,10 @@ public class SomaticIndelDetector extends ReadWalker { if ( event_length == 0 ) { // insertion l.add( Allele.create(referencePaddingBase,true) ); - l.add( Allele.create(referencePaddingBase + call.getVariant().getBases(), false )); + l.add( Allele.create((char)referencePaddingBase + new 
String(call.getVariant().getBases()), false )); } else { //deletion: - l.add( Allele.create(referencePaddingBase + call.getVariant().getBases(), true )); + l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), true )); l.add( Allele.create(referencePaddingBase,false) ); } } From 1acf0f0b2cd62c16e35d496c1eb0d23f9b9c480f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Aug 2012 22:36:27 -0400 Subject: [PATCH 04/48] Fixing bug in fasta .fai generation: trim the contig names to the first whitespace if one appears. We now generate indexes identical to samtools. --- .../sf/picard/reference/FastaSequenceIndexBuilder.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java index 10326ef2e..507d4b786 100644 --- a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java +++ b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java @@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder { * Reset iterators and add contig to sequence index */ private void finishReadingContig(FastaSequenceIndex sequenceIndex) { - sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); + sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); status = Status.NONE; contig = ""; size = 0; @@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder { } } + /* + * Trims the contig name to the expected value by removing any characters after the first whitespace + */ + private static String trimContigName(final String contigName) { + int whitespaceIndex = contigName.indexOf(' '); + return ( whitespaceIndex == -1 ) ? 
contigName : contigName.substring(0, whitespaceIndex); + } + /** * Stores FastaSequenceIndex as a .fasta.fai file on local machine * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder From 35baf0b15542b77dcaf702f2b3c1d990bbad4e27 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Aug 2012 09:07:36 -0400 Subject: [PATCH 05/48] This along with Mauricio's previous commit (thanks!) fixes GSA-522. There are no longer any modifications to reads in the map calls of ActiveRegion walkers. Added the bam which identified this error as a new integration test. --- .../HaplotypeCallerIntegrationTest.java | 8 ++++++++ .../gatk/traversals/TraverseActiveRegions.java | 1 + .../sting/utils/sam/GATKSAMRecord.java | 16 +++++++--------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 2ae1f2ca5..c1a1e065a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -66,4 +66,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testHaplotypeCallerSingleSampleIndelQualityScores() { HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970"); } + + @Test + public void HCTestProblematicReadsModifiedInActiveRegions() { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5")); + 
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 67de427e8..af981e676 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -241,6 +241,7 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); final M x = walker.map( activeRegion, null ); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c9b3a2df8..53e6dc0dc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -228,8 +228,7 @@ public class GATKSAMRecord extends BAMRecord { if( quals == null ) { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, EventType.BASE_INSERTION); + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 } return quals; } @@ -246,7 +245,6 @@ public class GATKSAMRecord extends BAMRecord { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will 
be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, EventType.BASE_DELETION); } return quals; } @@ -262,7 +260,7 @@ public class GATKSAMRecord extends BAMRecord { public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { mReadGroup = readGroup; retrievedReadGroup = true; - setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! + setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! } /////////////////////////////////////////////////////////////////////////////// @@ -367,15 +365,15 @@ public class GATKSAMRecord extends BAMRecord { * Clears all attributes except ReadGroup of the read. */ public GATKSAMRecord simplify () { - GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information + GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information byte[] insQuals = (this.getAttribute(BQSR_BASE_INSERTION_QUALITIES) == null) ? null : getBaseInsertionQualities(); byte[] delQuals = (this.getAttribute(BQSR_BASE_DELETION_QUALITIES) == null) ? 
null : getBaseDeletionQualities(); - this.clearAttributes(); // clear all attributes from the read - this.setReadGroup(rg); // restore read group + this.clearAttributes(); // clear all attributes from the read + this.setReadGroup(rg); // restore read group if (insQuals != null) - this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any + this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any if (delQuals != null) - this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any + this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any return this; } From 57d997f06f9286aae0ff2c59eeab5dbaa2a44d88 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Aug 2012 10:10:43 -0400 Subject: [PATCH 06/48] Fixing bug from when FragmentUtils merging function moved over to the soft clipped start instead of the unclipped start --- .../HaplotypeCallerIntegrationTest.java | 1 - .../sting/utils/fragments/FragmentUtils.java | 26 +++---------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c1a1e065a..b5359af46 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,5 +73,4 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } - } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 2f31c154c..a4a5d578a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -128,22 +128,13 @@ public class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } - public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + public final static List mergeOverlappingPairedFragments( final List overlappingPair ) { final byte MIN_QUAL_BAD_OVERLAP = 16; if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } GATKSAMRecord firstRead = overlappingPair.get(0); GATKSAMRecord secondRead = overlappingPair.get(1); - /* - System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart()); - System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd()); - System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart()); - System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd()); - System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart()); - System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd()); - System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart()); - System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd()); - */ + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { firstRead = overlappingPair.get(1); // swap them secondRead = overlappingPair.get(0); @@ -155,15 +146,6 @@ public class FragmentUtils { return overlappingPair; // fragments contain indels so 
don't merge them } -/* // check for inconsistent start positions between uncliped/soft alignment starts - if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart()) - return overlappingPair; - if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart()) - return overlappingPair; - - if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd()) - return overlappingPair; - */ final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); @@ -183,7 +165,7 @@ public class FragmentUtils { } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { - return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them } if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) { return overlappingPair; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on @@ -197,7 +179,7 @@ public class FragmentUtils { } final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); - returnRead.setAlignmentStart( firstRead.getUnclippedStart() ); + returnRead.setAlignmentStart( firstRead.getSoftStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); From 
8fc6a0a68b8073c1ec83e3bf983c18c60d13a016 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 09:25:05 -0400 Subject: [PATCH 08/48] Cleanup RefMetaDataTracker before refactoring ReadMetaDataTracker --- .../sting/commandline/RodBinding.java | 9 +++ .../ManagingReferenceOrderedView.java | 2 +- .../datasources/providers/RodLocusView.java | 2 +- .../gatk/refdata/RefMetaDataTracker.java | 69 ++++++++----------- .../ReferenceOrderedViewUnitTest.java | 7 +- .../refdata/RefMetaDataTrackerUnitTest.java | 2 +- 6 files changed, 44 insertions(+), 47 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java index e0b1154c4..15d134fa2 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java @@ -117,6 +117,15 @@ public final class RodBinding { this.bound = true; } + /** + * For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName + * @param type + * @param rawName + */ + public RodBinding(Class type, final String rawName) { + this(type, rawName, "missing", type.getSimpleName(), new Tags()); + } + /** * Make an unbound RodBinding. 
Only available for creating the globally unique UNBOUND object * @param type class this unbound RodBinding creates diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index d065635c8..080ac6686 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { // todo -- warning, I removed the reference to the name from states bindings.add( state.iterator.seekForward(loc) ); - return new RefMetaDataTracker(bindings, referenceContext); + return new RefMetaDataTracker(bindings); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index 54f8b44ed..4be7c63c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { // special case the interval again -- add it into the ROD if ( interval != null ) { allTracksHere.add(interval); } - return new RefMetaDataTracker(allTracksHere, referenceContext); + return new RefMetaDataTracker(allTracksHere); } public boolean hasNext() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 2c2ee51bb..7e32ec112 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -5,7 +5,6 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; @@ -32,11 +31,10 @@ import java.util.*; * Time: 3:05:23 PM */ public class RefMetaDataTracker { - // TODO: this should be a list, not a map, actually + // TODO: this should be a list, not a bindings, actually private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); - final Map map; - final ReferenceContext ref; + final Map bindings; final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); // ------------------------------------------------------------------------------------------ @@ -48,28 +46,25 @@ public class RefMetaDataTracker { // ------------------------------------------------------------------------------------------ /** - * Only for testing -- not accesssible in any other context + * Create an tracker with no bindings */ public RefMetaDataTracker() { - ref = null; - map = Collections.emptyMap(); + bindings = Collections.emptyMap(); } - public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { - this.ref = ref; - - // set up the map + public RefMetaDataTracker(final Collection allBindings) { + // set up the bindings if ( allBindings.isEmpty() ) - map = Collections.emptyMap(); + bindings = Collections.emptyMap(); else { - Map tmap = new HashMap(allBindings.size()); + final Map tmap = new HashMap(allBindings.size()); for ( RODRecordList rod : allBindings ) { if ( rod != null && ! 
rod.isEmpty() ) tmap.put(canonicalName(rod.getName()), rod); } - // ensure that no one modifies the map itself - map = Collections.unmodifiableMap(tmap); + // ensure that no one modifies the bindings itself + bindings = Collections.unmodifiableMap(tmap); } } @@ -99,7 +94,7 @@ public class RefMetaDataTracker { @Requires({"type != null"}) @Ensures("result != null") public List getValues(final Class type) { - return addValues(map.keySet(), type, new ArrayList(), null, false, false); + return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); } /** @@ -114,7 +109,7 @@ public class RefMetaDataTracker { @Requires({"type != null", "onlyAtThisLoc != null"}) @Ensures("result != null") public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { - return addValues(map.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); + return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); } /** @@ -296,7 +291,7 @@ public class RefMetaDataTracker { */ @Requires({"rodBinding != null"}) public boolean hasValues(final RodBinding rodBinding) { - return map.containsKey(canonicalName(rodBinding.getName())); + return bindings.containsKey(canonicalName(rodBinding.getName())); } /** @@ -306,7 +301,7 @@ public class RefMetaDataTracker { * @return List of all tracks */ public List getBoundRodTracks() { - return new ArrayList(map.values()); + return new ArrayList(bindings.values()); } /** @@ -314,38 +309,30 @@ public class RefMetaDataTracker { * @return the number of tracks with at least one bound Feature */ public int getNTracksWithBoundFeatures() { - return map.size(); + return bindings.size(); } // ------------------------------------------------------------------------------------------ - // - // - // old style accessors - // - // TODO -- DELETE ME - // - // + // Protected accessors using strings for unit testing // ------------------------------------------------------------------------------------------ - @Deprecated - 
public boolean hasValues(final String name) { - return map.containsKey(canonicalName(name)); + protected boolean hasValues(final String name) { + return bindings.containsKey(canonicalName(name)); } - @Deprecated - public List getValues(final Class type, final String name) { + protected List getValues(final Class type, final String name) { return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); } - @Deprecated - public List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + + protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); } - @Deprecated - public T getFirstValue(final Class type, final String name) { + + protected T getFirstValue(final Class type, final String name) { return safeGetFirst(getValues(type, name)); } - @Deprecated - public T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + + protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { return safeGetFirst(getValues(type, name, onlyAtThisLoc)); } @@ -366,7 +353,7 @@ public class RefMetaDataTracker { * @return */ @Requires({"l != null"}) - final private T safeGetFirst(final List l) { + private T safeGetFirst(final List l) { return l.isEmpty() ? null : l.get(0); } @@ -435,7 +422,7 @@ public class RefMetaDataTracker { */ private RODRecordList getTrackDataByName(final String name) { final String luName = canonicalName(name); - RODRecordList l = map.get(luName); + RODRecordList l = bindings.get(luName); return l == null ? 
EMPTY_ROD_RECORD_LIST : l; } @@ -448,7 +435,7 @@ public class RefMetaDataTracker { * @param name the name of the rod * @return canonical name of the rod */ - private final String canonicalName(final String name) { + private String canonicalName(final String name) { // todo -- remove me after switch to RodBinding syntax return name.toLowerCase(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index d75beae23..11a7b4cf7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; @@ -89,7 +90,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); - TableFeature datum = tracker.getFirstValue(TableFeature.class, "tableTest"); + TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); @@ -115,13 +116,13 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = 
view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); - TableFeature datum1 = tracker.getFirstValue(TableFeature.class, "tableTest1"); + TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); - TableFeature datum2 = tracker.getFirstValue(TableFeature.class, "tableTest2"); + TableFeature datum2 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest2")); Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index 91c18078e..2f73e373c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -133,7 +133,7 @@ public class RefMetaDataTrackerUnitTest { List x = new ArrayList(); if ( AValues != null ) x.add(AValues); if ( BValues != null ) x.add(BValues); - return new RefMetaDataTracker(x, context); + return new RefMetaDataTracker(x); } public int nBoundTracks() { From 972be8b4a4babce3b198f5c871e2359130696b6e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 16:58:21 -0400 Subject: [PATCH 09/48] Part I of GSA-462: Consistent RODBinding access across Ref and Read trackers -- ReadMetaDataTracker is dead! Long live the RefMetaDataTracker. Read walkers will soon just take RefMetaDataTracker objects. 
In this commit they take a class that trivially extends them -- Rewrote ReadBasedReferenceOrderedView to produce RefMetaDataTrackers not the old class. -- This new implementation produces thread-safe objects (i.e., holds no pointers to shared state). Suitable for use (to be tested) with nano scheduling -- Simplified interfaces to use the simplest data structures (PeekableIterator) not the LocusAwareSeekableIterator, since I both hate those classes and this is on the long term trajectory to remove those from the GATK entirely. -- Massively expanded DataProvider unit tests for ReadBasedReferenceOrderedView -- Note that the old implementation of offset -> ROD in ReadRefMetaDataTracker was broken for any read not completely matching the reference. Rather than provide broken code the ReadMetaDataTracker only provides a "bag of RODs" interface. If you want to work with the relationship between the read and the RODs in your tool you need to manage the CIGAR element itself. -- This commit breaks the new read walker BQSR, but Ryan knows this is coming -- Subsequent commit will be retiring / fixing ValidateRODForReads --- .../IntervalOverlappingRODsFromStream.java | 143 ++++++ .../ReadBasedReferenceOrderedView.java | 210 ++++----- .../gatk/refdata/ReadMetaDataTracker.java | 140 +----- .../gatk/walkers/indels/IndelRealigner.java | 15 +- .../broadinstitute/sting/utils/GenomeLoc.java | 9 + ...ReadBasedReferenceOrderedViewUnitTest.java | 438 ++++++++++++------ .../refdata/ReadMetaDataTrackerUnitTest.java | 276 ----------- 7 files changed, 537 insertions(+), 694 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java new file mode 100644 index 000000000..1e39d6836 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java @@ -0,0 +1,143 @@ +package org.broadinstitute.sting.gatk.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * Key algorithmic helper for ReadBasedReferenceOrderedData + * + * Takes a single iterator of features, and provides a single capability that returns + * the list of RODs that overlap an interval. Allows sequential getOverlapping calls + * from intervals provided that these intervals always have increasing getStart() values. + * + */ +class IntervalOverlappingRODsFromStream { + /** + * Only held for QC purposes + */ + GenomeLoc lastQuery = null; + + private final String name; + private final LinkedList currentFeatures = new LinkedList(); + private final PeekableIterator futureFeatures; + + /** + * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and + * returns RODRecordLists having name + * + * @param name + * @param futureFeatures + */ + IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { + if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); + + this.name = name; + this.futureFeatures = futureFeatures; + } + + /** + * Get the list of RODs overlapping loc from this stream of RODs. 
+ * + * Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart + * + * @param loc the interval to query + * @return a non-null RODRecordList containing the overlapping RODs, which may be empty + */ + @Ensures({"overlaps(loc, result)", + "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", + "result != null"}) + public RODRecordList getOverlapping(final GenomeLoc loc) { + if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) + throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); + + trimCurrentFeaturesToLoc(loc); + readOverlappingFutureFeatures(loc); + return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); + } + + + /** + * For contract assurance. Checks that all bindings in loc overlap + * + * @param loc + * @param bindings + * @return + */ + @Requires({"loc != null", "bindings != null"}) + private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { + for ( final GATKFeature feature : bindings ) + if ( ! feature.getLocation().overlapsP(loc) ) + return false; + return true; + } + + /** + * Subset the features in all to those that overlap with loc + * + * The current features list contains everything read that cannot be thrown away yet, but not + * everything in there necessarily overlaps with loc. 
Subset to just those that do overlap + * + * @param loc the location that features must overlap + * @param all the list of all features + * @return a subset of all that overlaps with loc + */ + @Requires({"loc != null", "all != null"}) + @Ensures("result.size() <= all.size()") + private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { + final LinkedList overlapping = new LinkedList(); + for ( final GATKFeature feature : all ) + if ( feature.getLocation().overlapsP(loc) ) + overlapping.add(feature); + return overlapping; + } + + /** + * Update function. Remove all elements of currentFeatures that end before loc + * + * @param loc the location to use + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() <= old(currentFeatures.size())") + private void trimCurrentFeaturesToLoc(final GenomeLoc loc) { + final ListIterator it = currentFeatures.listIterator(); + while ( it.hasNext() ) { + final GATKFeature feature = it.next(); + if ( feature.getLocation().isBefore(loc) ) + it.remove(); + } + } + + /** + * Update function: Read all elements from futureFeatures that overlap with loc + * + * Stops at the first element that starts before the end of loc, or the stream empties + * + * @param loc + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() >= old(currentFeatures.size())") + private void readOverlappingFutureFeatures(final GenomeLoc loc) { + while ( futureFeatures.hasNext() ) { + final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); + if ( nextLoc.isBefore(loc) ) { + futureFeatures.next(); // next rod element is before loc, throw it away and keep looking + } else if ( nextLoc.isPast(loc) ) { + break; // next element is past loc, stop looking but don't pop it + } else if ( nextLoc.overlapsP(loc) ) { + // add overlapping elements to our current features, removing from stream + for ( final GATKFeature feature : futureFeatures.next() ) { + currentFeatures.add(feature); + } + } + } + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index 01e24df67..054758101 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -23,40 +23,63 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.ArrayList; import java.util.Collection; import java.util.List; -import java.util.TreeMap; /** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */ public class ReadBasedReferenceOrderedView implements View { - private final WindowedData window; + // a list of the RMDDataState (location->iterators) + private final List states = new ArrayList(1); + private final static ReadMetaDataTracker EMPTY_TRACKER = new ReadMetaDataTracker(); - public ReadBasedReferenceOrderedView(ShardDataProvider provider) { - window = new WindowedData(provider); + /** + * Used to get genome locs for reads + */ + private final GenomeLocParser genomeLocParser; + + /** + * The total extent of all reads in this span. 
We create iterators from our RODs + * from the start of this span, to the end. + */ + private final GenomeLoc shardSpan; + + public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { + this(provider.getGenomeLocParser(), provider.getShard().getLocation()); provider.register(this); + + if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) { + for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) + states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); + } + } + + private ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, final GenomeLoc shardSpan) { + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; } /** - * for testing only please - * - * @param data the window provider + * Testing constructor */ - ReadBasedReferenceOrderedView(WindowedData data) { - window = data; - } - - public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) { - return window.getTracker(read); + protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, + final GenomeLoc shardSpan, + final List names, + final List> featureSources) { + this(genomeLocParser, shardSpan); + for ( int i = 0; i < names.size(); i++ ) + states.add(new RMDDataState(names.get(i), featureSources.get(i))); } public Collection> getConflictingViews() { @@ -65,74 +88,6 @@ public class ReadBasedReferenceOrderedView implements View { return classes; } - public void close() { - if (window != null) window.close(); - } -} - - -/** stores a window of data, dropping RODs if we've passed the new reads start point. 
*/ -class WindowedData { - // the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope - private final TreeMap mapping = new TreeMap(); - - // our current location from the last read we processed - private GenomeLoc currentLoc; - - // a list of the RMDDataState (location->iterators) - private List states; - - // the provider; where we get all our information - private final ShardDataProvider provider; - - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(WindowedData.class); - - /** - * create a WindowedData given a shard provider - * - * @param provider the ShardDataProvider - */ - public WindowedData(ShardDataProvider provider) { - this.provider = provider; - } - - /** - * load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one) - * - * @param provider the ShardDataProvider - * @param rec the current read - */ - private void getStates(ShardDataProvider provider, SAMRecord rec) { - - int stop = Integer.MAX_VALUE; - // figure out the appropriate alignment stop - if (provider.hasReference()) { - stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength(); - } - - // calculate the range of positions we need to look at - GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(), - rec.getAlignmentStart(), - stop); - states = new ArrayList(); - if (provider.getReferenceOrderedData() != null) - for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) - states.add(new RMDDataState(dataSource, dataSource.seek(range))); - } - - /** - * this function is for testing only - * - * @param states a list of RMDDataState to initialize with - */ - WindowedData(List states) { - this.states = states; - provider = null; - } - /** * create a ReadMetaDataTracker given the current read * @@ -140,60 +95,65 @@ class WindowedData { 
* * @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments */ - public ReadMetaDataTracker getTracker(SAMRecord rec) { - updatePosition(rec); - return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping); + @Requires("rec != null") + @Ensures("result != null") + public ReadMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { + if ( rec.getReadUnmappedFlag() ) + // empty RODs for unmapped reads + return new ReadMetaDataTracker(); + else + return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); } - /** - * update the position we're storing - * - * @param rec the read to use for start and end - */ - private void updatePosition(SAMRecord rec) { - if (states == null) getStates(this.provider, rec); - currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec); - - // flush the queue looking for records we've passed over - while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart()) - mapping.pollFirstEntry(); // toss away records that we've passed - - // add new data to the queue - for (RMDDataState state : states) { - // move into position - while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc)) - state.iterator.next(); - while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) { - RODRecordList list = state.iterator.next(); - for (GATKFeature datum : list) { - if (!mapping.containsKey(list.getLocation().getStart())) - mapping.put(list.getLocation().getStart(), new RODMetaDataContainer()); - mapping.get(list.getLocation().getStart()).addEntry(datum); - } - } + @Requires({"interval != null", "shardSpan.containsP(interval)"}) + @Ensures("result != null") + public ReadMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + if ( states.isEmpty() ) // optimization for no bindings (common for read walkers) + return EMPTY_TRACKER; + else { + final List bindings = new 
ArrayList(states.size()); + for ( final RMDDataState state : states ) + bindings.add(state.stream.getOverlapping(interval)); + return new ReadMetaDataTracker(bindings); } } - /** Closes the current view. */ + /** + * Closes the current view. + */ public void close() { - if (states == null) return; - for (RMDDataState state : states) - state.dataSource.close( state.iterator ); + for (final RMDDataState state : states) + state.close(); // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states = null; + states.clear(); } + /** Models the traversal state of a given ROD lane. */ + private static class RMDDataState { + public final ReferenceOrderedDataSource dataSource; + public final IntervalOverlappingRODsFromStream stream; + private final LocationAwareSeekableRODIterator iterator; -} + public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { + this.dataSource = dataSource; + this.iterator = iterator; + this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator(iterator)); + } -/** Models the traversal state of a given ROD lane. 
*/ -class RMDDataState { - public final ReferenceOrderedDataSource dataSource; - public final LocationAwareSeekableRODIterator iterator; + /** + * For testing + */ + public RMDDataState(final String name, final PeekableIterator iterator) { + this.dataSource = null; + this.iterator = null; + this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator(iterator)); + } - public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { - this.dataSource = dataSource; - this.iterator = iterator; + public void close() { + if ( dataSource != null ) + dataSource.close( iterator ); + } } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java index 96dbd15f2..cfea5901e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.refdata; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -39,141 +40,12 @@ import java.util.*; *

* a read-based meta data tracker */ -public class ReadMetaDataTracker { - /** - * The parser, used to create new GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - - private final SAMRecord record; - - // the buffer of positions and RODs we've stored - private final TreeMap mapping; - - /** - * create a read meta data tracker, given the read and a queue of RODatum positions - * - * @param record the read to create offset from - * @param mapping the mapping of reference ordered datum - */ - public ReadMetaDataTracker(GenomeLocParser genomeLocParser, SAMRecord record, TreeMap mapping) { - this.genomeLocParser = genomeLocParser; - this.record = record; - this.mapping = mapping; +public class ReadMetaDataTracker extends RefMetaDataTracker { + public ReadMetaDataTracker() { + super(); } - /** - * create an alignment of read position to reference ordered datum - * - * @param record the SAMRecord - * @param queue the queue (as a tree set) - * @param cl the class name, null if not filtered by classname - * @param name the datum track name, null if not filtered by name - * - * @return a mapping from the position in the read to the reference ordered datum - */ - private Map> createReadAlignment(SAMRecord record, TreeMap queue, Class cl, String name) { - if (name != null && cl != null) throw new IllegalStateException("Both a class and name cannot be specified"); - Map> ret = new LinkedHashMap>(); - GenomeLoc location = genomeLocParser.createGenomeLoc(record); - int length = record.getReadLength(); - for (Integer loc : queue.keySet()) { - Integer position = loc - location.getStart(); - if (position >= 0 && position < length) { - Collection set; - if (cl != null) - set = queue.get(loc).getSet(cl); - else - set = queue.get(loc).getSet(name); - if (set != null && set.size() > 0) - ret.put(position, set); - } - } - return ret; - - } - - /** - * create an alignment of read position to reference ordered datum - * - * @return a mapping from the position in the read 
to the reference ordered datum - */ - private Map> createGenomeLocAlignment(SAMRecord record, TreeMap mapping, Class cl, String name) { - Map> ret = new LinkedHashMap>(); - int start = record.getAlignmentStart(); - int stop = record.getAlignmentEnd(); - for (Integer location : mapping.keySet()) { - if (location >= start && location <= stop) - if (cl != null) - ret.put(location, mapping.get(location).getSet(cl)); - else - ret.put(location, mapping.get(location).getSet(name)); - } - return ret; - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping() { - return createReadAlignment(record, mapping, null, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping() { - return createGenomeLocAlignment(record, mapping, null, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping(String name) { - return createReadAlignment(record, mapping, null, name); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping(String name) { - return createGenomeLocAlignment(record, mapping, null, name); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping(Class cl) { - return createReadAlignment(record, mapping, cl, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping(Class cl) { - return createGenomeLocAlignment(record, mapping, cl, null); - } - - /** - * get the list of all the RODS overlapping this read, without any information about 
their position - * @return a Collection (no order guaranteed), of all the RODs covering this read - */ - public List getAllCoveringRods() { - List ret = new ArrayList(); - for (Map.Entry entry : mapping.entrySet()) - ret.addAll(entry.getValue().getSet()); - return ret; + public ReadMetaDataTracker(Collection allBindings) { + super(allBindings); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d61b9e9b6..e6eddc0b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -620,16 +620,11 @@ public class IndelRealigner extends ReadWalker { } private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) { - for ( Collection rods : metaDataTracker.getContigOffsetMapping().values() ) { - Iterator rodIter = rods.iterator(); - while ( rodIter.hasNext() ) { - Object rod = rodIter.next().getUnderlyingObject(); - if ( indelRodsSeen.contains(rod) ) - continue; - indelRodsSeen.add(rod); - if ( rod instanceof VariantContext ) - knownIndelsToTry.add((VariantContext)rod); - } + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( indelRodsSeen.contains(vc) ) + continue; + indelRodsSeen.add(vc); + knownIndelsToTry.add(vc); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index f8faa101b..0b35dd599 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -125,6 +125,15 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ! 
discontinuousP( that ); } + /** + * Return true if this GenomeLoc represents the UNMAPPED location + * @return + */ + public final boolean isUnmapped() { + return isUnmapped(this); + } + + /** * Returns a new GenomeLoc that represents the entire span of this and that. Requires that * this and that GenomeLoc are contiguous and both mapped diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index 41bdda0e0..ff8952dfa 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -1,207 +1,347 @@ /* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2010. The Broad Institute +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.gatk.datasources.providers; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.testng.Assert; +import org.broad.tribble.BasicFeature; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTrackerUnitTest; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - /** - * @author aaron - *

- * Class ReadBasedReferenceOrderedViewUnitTest - *

- * test out the ReadBasedReferenceOrderedView class + * @author depristo */ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { - private GenomeLocParser genomeLocParser; - private static int startingChr = 1; private static int endingChr = 2; private static int readCount = 100; private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private static String contig; private static SAMFileHeader header; + private GenomeLocParser genomeLocParser; + @BeforeClass public void beforeClass() { header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + contig = header.getSequence(0).getSequenceName(); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + initializeTests(); } - @BeforeMethod - public void beforeEach() { - } - - @Test - public void testCreateReadMetaDataTrackerOnePerSite() { - // make ten reads, - List records = new ArrayList(); - for (int x = 1; x < 11; x++) { - SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "name", 0, x, 10); + private class CompareFeatures implements Comparator { + @Override + public int compare(Feature o1, Feature o2) { + return genomeLocParser.createGenomeLoc(o1).compareTo(genomeLocParser.createGenomeLoc(o2)); } - GenomeLoc start = genomeLocParser.createGenomeLoc(header.getSequenceDictionary().getSequence(0).getSequenceName(), 0, 0); - List list = new ArrayList(); - list.add(new RMDDataState(null, new FakePeekingRODIterator(genomeLocParser,start, "fakeName"))); - ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(new WindowedData(list)); + } - for (SAMRecord rec : records) { - ReadMetaDataTracker tracker = view.getReferenceOrderedDataForRead(rec); - Map> map = tracker.getReadOffsetMapping(); - for (Integer i : map.keySet()) { - Assert.assertEquals(map.get(i).size(), 1); + private class ReadMetaDataTrackerRODStreamTest extends TestDataProvider { + final List 
allFeatures; + final List intervals; + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final GenomeLoc interval) { + this(allFeatures, Collections.singletonList(interval)); + } + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final List intervals) { + super(ReadMetaDataTrackerRODStreamTest.class); + this.allFeatures = new ArrayList(allFeatures); + Collections.sort(this.allFeatures, new CompareFeatures()); + this.intervals = new ArrayList(intervals); + Collections.sort(this.intervals); + setName(String.format("%s nFeatures %d intervals %s", getClass().getSimpleName(), allFeatures.size(), + intervals.size() == 1 ? intervals.get(0) : "size " + intervals.size())); + } + + public PeekableIterator getIterator(final String name) { + return new PeekableIterator(new TribbleIteratorFromCollection(name, genomeLocParser, allFeatures)); + } + + public Set getExpectedOverlaps(final GenomeLoc interval) { + final Set overlapping = new HashSet(); + for ( final Feature f : allFeatures ) + if ( genomeLocParser.createGenomeLoc(f).overlapsP(interval) ) + overlapping.add(f); + return overlapping; + } + } + + public void initializeTests() { + final List handPickedFeatures = new ArrayList(); + + handPickedFeatures.add(new BasicFeature(contig, 1, 1)); + handPickedFeatures.add(new BasicFeature(contig, 2, 5)); + handPickedFeatures.add(new BasicFeature(contig, 4, 4)); + handPickedFeatures.add(new BasicFeature(contig, 6, 6)); + handPickedFeatures.add(new BasicFeature(contig, 9, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 11)); + handPickedFeatures.add(new BasicFeature(contig, 13, 20)); + + createTestsForFeatures(handPickedFeatures); + + // test in the present of a large spanning element + { + List oneLargeSpan = new ArrayList(handPickedFeatures); + oneLargeSpan.add(new BasicFeature(contig, 1, 100)); + createTestsForFeatures(oneLargeSpan); + } + + // test in the presence of a 
partially spanning element + { + List partialSpanStart = new ArrayList(handPickedFeatures); + partialSpanStart.add(new BasicFeature(contig, 1, 6)); + createTestsForFeatures(partialSpanStart); + } + + // test in the presence of a partially spanning element at the end + { + List partialSpanEnd = new ArrayList(handPickedFeatures); + partialSpanEnd.add(new BasicFeature(contig, 10, 100)); + createTestsForFeatures(partialSpanEnd); + } + + // no data at all + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, 5, 5); + new ReadMetaDataTrackerRODStreamTest(Collections.emptyList(), loc); + } + + // -------------------------------------------------------------------------------- + // + // tests for the lower level IntervalOverlappingRODsFromStream + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerRODStreamTest") + public Object[][] createReadMetaDataTrackerRODStreamTest() { + return ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + } + + private GenomeLoc span(final List features) { + int featuresStart = 1; for ( final GenomeLoc f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final GenomeLoc f : features ) featuresStop = Math.max(featuresStop, f.getStop()); + return genomeLocParser.createGenomeLoc(contig, featuresStart, featuresStop); + } + + private void createTestsForFeatures(final List features) { + int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final Feature f : features ) featuresStop = Math.max(featuresStop, f.getEnd()); + + for ( final int size : Arrays.asList(1, 5, 10, 100, 1000) ) { + final List allIntervals = new ArrayList(); + // regularly spaced + for ( int start = featuresStart; start < featuresStop; start++) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + 
size - 1); + allIntervals.add(loc); + new ReadMetaDataTrackerRODStreamTest(features, loc); } - Assert.assertEquals(map.keySet().size(), 10); + + // starting and stopping at every feature + for ( final Feature f : features ) { + // just at the feature + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart(), f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // up to end + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // missing by 1 + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() + 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // just spanning + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + } + + new ReadMetaDataTrackerRODStreamTest(features, allIntervals); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest") + public void runReadMetaDataTrackerRODStreamTest_singleQuery(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() == 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, Collections.singletonList(data.intervals.get(0))); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_singleQuery") + public void runReadMetaDataTrackerRODStreamTest_multipleQueries(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() > 1 ) { + final String name = "testName"; + final PeekableIterator 
iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, data.intervals); + } + } + + private void testRODStream(final ReadMetaDataTrackerRODStreamTest test, final IntervalOverlappingRODsFromStream stream, final List intervals) { + for ( final GenomeLoc interval : intervals ) { + final RODRecordList query = stream.getOverlapping(interval); + final HashSet queryFeatures = new HashSet(); + for ( final GATKFeature f : query ) queryFeatures.add((Feature)f.getUnderlyingObject()); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." 
+ + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + // -------------------------------------------------------------------------------- + // + // tests for the higher level tracker itself + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerTests") + public Object[][] createTrackerTests() { + List tests = new ArrayList(); + + final Object[][] singleTests = ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + final List multiSiteTests = new ArrayList(); + for ( final Object[] singleTest : singleTests ) { + if ( ((ReadMetaDataTrackerRODStreamTest)singleTest[0]).intervals.size() > 1 ) + multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); } + // all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest}); + } + + // all 3 way pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + tests.add(new Object[]{singleTest}); + } + + return tests.toArray(new Object[][]{}); } -} + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") + public void runReadMetaDataTrackerTest(final List RODs) { + final List names = new ArrayList(); + final List> iterators = new ArrayList>(); + final List intervals = new ArrayList(); + final List> rodBindings = new ArrayList>(); + for ( int i = 0; i < RODs.size(); i++ ) { + final RodBinding rodBinding = new RodBinding(Feature.class, "name"+i); + rodBindings.add(rodBinding); + final String name = rodBinding.getName(); + names.add(name); + iterators.add(RODs.get(i).getIterator(name)); + intervals.addAll(RODs.get(i).intervals); + } -class FakePeekingRODIterator implements LocationAwareSeekableRODIterator { - private GenomeLocParser genomeLocParser; + 
Collections.sort(intervals); + final GenomeLoc span = span(intervals); + final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators); - // current location - private GenomeLoc location; - private GATKFeature curROD; - private final String name; + for ( final GenomeLoc interval : intervals ) { + final ReadMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); - public FakePeekingRODIterator(GenomeLocParser genomeLocParser, GenomeLoc startingLoc, String name) { - this.name = name; - this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } } /** - * Gets the header associated with the backing input stream. - * @return the ROD header. + * Created with IntelliJ IDEA. + * User: depristo + * Date: 8/29/12 + * Time: 1:19 PM + * To change this template use File | Settings | File Templates. 
*/ - @Override - public Object getHeader() { - return null; - } + static class TribbleIteratorFromCollection implements Iterator { + // current location + private final String name; + final Queue gatkFeatures; - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. - */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return null; - } + public TribbleIteratorFromCollection(final String name, final GenomeLocParser genomeLocParser, final List features) { + this.name = name; + this.gatkFeatures = new LinkedList(); + for ( final Feature f : features ) + gatkFeatures.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + } - @Override - public GenomeLoc peekNextLocation() { - System.err.println("Peek Next -> " + location); - return location; - } + @Override + public boolean hasNext() { + return ! gatkFeatures.isEmpty(); + } - @Override - public GenomeLoc position() { - return location; - } + @Override + public RODRecordList next() { + final GATKFeature first = gatkFeatures.poll(); + final Collection myFeatures = new LinkedList(); + myFeatures.add(first); + while ( gatkFeatures.peek() != null && gatkFeatures.peek().getLocation().getStart() == first.getStart() ) + myFeatures.add(gatkFeatures.poll()); - @Override - public RODRecordList seekForward(GenomeLoc interval) { - while (location.isBefore(interval)) - next(); - return next(); // we always move by one, we know the next location will be right - } + GenomeLoc loc = first.getLocation(); + for ( final GATKFeature feature : myFeatures ) + loc = loc.merge(feature.getLocation()); - @Override - public boolean hasNext() { - return true; // we always have next - } + return new RODRecordListImpl(name, myFeatures, loc); // is this safe? 
+ } - @Override - public RODRecordList next() { - System.err.println("Next -> " + location); - curROD = new ReadMetaDataTrackerUnitTest.FakeRODatum(location, name); - location = genomeLocParser.createGenomeLoc(location.getContig(), location.getStart() + 1, location.getStop() + 1); - FakeRODRecordList list = new FakeRODRecordList(); - list.add(curROD); - return list; - } - - @Override - public void remove() { - throw new IllegalStateException("GRRR"); - } - - @Override - public void close() { - // nothing to do + @Override public void remove() { throw new IllegalStateException("GRRR"); } } } -class FakeRODRecordList extends AbstractList implements RODRecordList { - private final List list = new ArrayList(); - public boolean add(GATKFeature data) { - return list.add(data); - } - - @Override - public GATKFeature get(int i) { - return list.get(i); - } - - @Override - public int size() { - return list.size(); - } - - @Override - public GenomeLoc getLocation() { - return list.get(0).getLocation(); - } - - @Override - public String getName() { - return "test"; - } - - @Override - public int compareTo(RODRecordList rodRecordList) { - return this.list.get(0).getLocation().compareTo(rodRecordList.getLocation()); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java deleted file mode 100644 index 2198c461d..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2010. 
The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.*; - - -/** - * @author aaron - *

- * Class ReadMetaDataTrackerUnitTest - *

- * test out the ReadMetaDataTracker - */ -public class ReadMetaDataTrackerUnitTest extends BaseTest { - private static int startingChr = 1; - private static int endingChr = 2; - private static int readCount = 100; - private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; - private static SAMFileHeader header; - private Set nameSet; - - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - @BeforeMethod - public void beforeEach() { - nameSet = new TreeSet(); - nameSet.add("default"); - } - - @Test - public void twoRodsAtEachReadBase() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 2); - } - Assert.assertEquals(count, 10); - } - - @Test - public void rodAtEachReadBase() { - - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void filterByName() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping("default"); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void filterByDupType() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type - 
// count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(FakeRODatum.class); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 2); - } - Assert.assertEquals(count, 10); - } - - // @Test this test can be uncommented to determine the speed impacts of any changes to the RODs for reads system - - public void filterByMassiveDupType() { - - for (int y = 0; y < 20; y++) { - nameSet.add("default" + String.valueOf(y)); - long firstTime = System.currentTimeMillis(); - for (int lp = 0; lp < 1000; lp++) { - ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(FakeRODatum.class); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), y + 2); - } - Assert.assertEquals(count, 10); - } - System.err.println(y + " = " + (System.currentTimeMillis() - firstTime)); - } - } - - - @Test - public void filterByType() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(Fake2RODatum.class); - for (int x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void sparceRODsForRead() { - ReadMetaDataTracker tracker = getRMDT(7, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 1); - } - Assert.assertEquals(count, 2); - } - - @Test - public void rodByGenomeLoc() { - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getContigOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getContigOffsetMapping().get(x).size(), 1); - } - 
Assert.assertEquals(count, 10); - } - - - /** - * create a ReadMetaDataTracker given: - * - * @param incr the spacing between site locations - * @param names the names of the reference ordered data to create: one will be created at every location for each name - * - * @return a ReadMetaDataTracker - */ - private ReadMetaDataTracker getRMDT(int incr, Set names, boolean alternateTypes) { - SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "name", 0, 1, 10); - TreeMap data = new TreeMap(); - for (int x = 0; x < record.getAlignmentEnd(); x += incr) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getReferenceName(), record.getAlignmentStart() + x, record.getAlignmentStart() + x); - RODMetaDataContainer set = new RODMetaDataContainer(); - - int cnt = 0; - for (String name : names) { - if (alternateTypes) - set.addEntry((cnt % 2 == 0) ? new FakeRODatum(loc, name) : new Fake2RODatum(loc, name)); - else - set.addEntry(new FakeRODatum(loc, name)); - cnt++; - } - data.put(record.getAlignmentStart() + x, set); - } - ReadMetaDataTracker tracker = new ReadMetaDataTracker(genomeLocParser, record, data); - return tracker; - } - - - /** for testing, we want a fake rod with a different classname, for the get-by-class-name functions */ - static public class Fake2RODatum extends FakeRODatum { - - public Fake2RODatum(GenomeLoc location, String name) { - super(location, name); - } - } - - - /** for testing only */ - static public class FakeRODatum extends GATKFeature { - - final GenomeLoc location; - final String name; - - public FakeRODatum(GenomeLoc location, String name) { - super(name); - this.location = location; - this.name = name; - } - - @Override - public String getName() { - return name; - } - - @Override - public GenomeLoc getLocation() { - return this.location; - } - - @Override - public Object getUnderlyingObject() { - return null; //To change body of implemented methods use File | Settings | File Templates. 
- } - - @Override - public String getChr() { - return location.getContig(); - } - - @Override - public int getStart() { - return (int)this.location.getStart(); - } - - @Override - public int getEnd() { - return (int)this.location.getStop(); - } - } -} From 1200848bbfb7069f898e1933ed687a0e18f56e0a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 17:39:32 -0400 Subject: [PATCH 10/48] Part II of GSA-462: Consistent RODBinding access across Ref and Read trackers -- Deleted ReadMetaDataTracker -- Added function to ReadShard to give us the span from the left most position of the reads in the shard to the right most, which is needed for the new view --- .../compression/reducereads/ReduceReads.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 2 +- .../sting/alignment/AlignmentValidation.java | 4 +- .../ReadBasedReferenceOrderedView.java | 21 ++++---- .../gatk/datasources/reads/ReadShard.java | 31 +++++++++-- .../gatk/refdata/ReadMetaDataTracker.java | 51 ------------------- .../sting/gatk/traversals/TraverseReads.java | 4 +- .../gatk/traversals/TraverseReadsNano.java | 5 +- .../sting/gatk/walkers/ClipReads.java | 4 +- .../sting/gatk/walkers/FlagStat.java | 4 +- .../sting/gatk/walkers/PrintReads.java | 4 +- .../sting/gatk/walkers/ReadWalker.java | 5 +- .../sting/gatk/walkers/SplitSamFile.java | 4 +- .../diagnostics/ReadGroupProperties.java | 4 +- .../diagnostics/ReadLengthDistribution.java | 4 +- .../gatk/walkers/indels/IndelRealigner.java | 9 ++-- .../gatk/walkers/indels/LeftAlignIndels.java | 5 +- .../walkers/indels/SomaticIndelDetector.java | 4 +- .../sting/gatk/walkers/qc/CountBases.java | 4 +- .../sting/gatk/walkers/qc/CountMales.java | 4 +- .../gatk/walkers/qc/CountReadEvents.java | 4 +- .../sting/gatk/walkers/qc/CountReads.java | 4 +- .../gatk/walkers/qc/CountTerminusEvent.java | 4 +- .../gatk/walkers/qc/ReadClippingStats.java | 4 +- ...ReadBasedReferenceOrderedViewUnitTest.java | 4 +- .../reads/GATKWalkerBenchmark.java | 4 +- 26 files 
changed, 85 insertions(+), 116 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 177050667..d1ec9c474 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.ReadFilters; @@ -247,7 +247,7 @@ public class ReduceReads extends ReadWalker, ReduceRea * @return a linked list with all the reads produced by the clipping operations */ @Override - public LinkedList map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public LinkedList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { LinkedList mappedReads; totalReads++; if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 845fc68a6..3d41b7233 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -308,7 +308,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { if( !allelesToGenotype.contains(vc) ) { - allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object + allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java index e8eea5ff0..b903b9f7d 100644 --- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java +++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker { * @return Number of reads aligned by this map (aka 1). 
*/ @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { //logger.info(String.format("examining read %s", read.getReadName())); byte[] bases = read.getReadBases(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index 054758101..adf1b34df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -27,8 +27,9 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; @@ -38,11 +39,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */ +/** a ROD view for reads. 
This provides the Read traversals a way of getting a RefMetaDataTracker */ public class ReadBasedReferenceOrderedView implements View { // a list of the RMDDataState (location->iterators) private final List states = new ArrayList(1); - private final static ReadMetaDataTracker EMPTY_TRACKER = new ReadMetaDataTracker(); + private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); /** * Used to get genome locs for reads @@ -56,7 +57,7 @@ public class ReadBasedReferenceOrderedView implements View { private final GenomeLoc shardSpan; public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - this(provider.getGenomeLocParser(), provider.getShard().getLocation()); + this(provider.getGenomeLocParser(), ((ReadShard)provider.getShard()).getReadsSpan()); provider.register(this); if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) { @@ -89,32 +90,32 @@ public class ReadBasedReferenceOrderedView implements View { } /** - * create a ReadMetaDataTracker given the current read + * create a RefMetaDataTracker given the current read * * @param rec the read * - * @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments + * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments */ @Requires("rec != null") @Ensures("result != null") - public ReadMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { + public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { if ( rec.getReadUnmappedFlag() ) // empty RODs for unmapped reads - return new ReadMetaDataTracker(); + return new RefMetaDataTracker(); else return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); } @Requires({"interval != null", "shardSpan.containsP(interval)"}) @Ensures("result != null") - public ReadMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + public RefMetaDataTracker 
getReferenceOrderedDataForInterval(final GenomeLoc interval) { if ( states.isEmpty() ) // optimization for no bindings (common for read walkers) return EMPTY_TRACKER; else { final List bindings = new ArrayList(states.size()); for ( final RMDDataState state : states ) bindings.add(state.stream.getOverlapping(interval)); - return new ReadMetaDataTracker(bindings); + return new RefMetaDataTracker(bindings); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index f5a4cb4cf..9e1c12186 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @@ -125,4 +122,30 @@ public class ReadShard extends Shard { } return sb.toString(); } + + /** + * Get the full span from the start of the left most read to the end of the right most one + * + * Note this may be different than the getLocation() of the shard, as this reflects the + * targeted span, not the actual span of reads + * + * @return the genome loc representing the span of these reads on the genome + */ + public GenomeLoc getReadsSpan() { + if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() ) + return super.getLocation(); + else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; + + for ( final SAMRecord read : reads ) { + contig = read.getReferenceName(); + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } + + 
return parser.createGenomeLoc(contig, start, stop); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java deleted file mode 100644 index cfea5901e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.refdata; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.*; - - -/** - * @author aaron - *

- * Class ReadMetaDataTracker - *

- * a read-based meta data tracker - */ -public class ReadMetaDataTracker extends RefMetaDataTracker { - public ReadMetaDataTracker() { - super(); - } - - public ReadMetaDataTracker(Collection allBindings) { - super(allBindings); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index 2dc0444b2..3b712c973 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrd import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -91,7 +91,7 @@ public class TraverseReads extends TraversalEngine,Read dataProvider.getShard().getReadMetrics().incrementNumIterations(); // if the read is mapped, create a metadata tracker - final ReadMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? 
rodView.getReferenceOrderedDataForRead(read) : null; final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); if (keepMeP) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 4215230b8..081c6b8fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -29,9 +29,8 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; @@ -142,7 +141,7 @@ public class TraverseReadsNano extends TraversalEngine, //dataProvider.getShard().getReadMetrics().incrementNumIterations(); // if the read is mapped, create a metadata tracker - final ReadMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? 
rodView.getReferenceOrderedDataForRead(read) : null; final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); if (keepMeP) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java index 4eaa16692..e63dbcabd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ClippingOp; @@ -299,7 +299,7 @@ public class ClipReads extends ReadWalker impleme * @param read the read itself, as a GATKSAMRecord * @return the read itself */ - public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker ) { return simplifyReads ? 
read.simplify() : read; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index 77e3af93f..42fbb32bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.gatk.walkers; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -27,5 +26,5 @@ public abstract class ReadWalker extends Walker { } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, RefMetaDataTracker RefMetaDataTracker) { final String rgID = read.getReadGroup().getId(); final PerReadGroupInfo info = readGroupInfo.get(rgID); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java index 1dc8a7ec1..2b84cccc9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -4,7 +4,7 @@ import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; 
import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -74,7 +74,7 @@ public class ReadLengthDistribution extends ReadWalker { } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) { GATKReportTable table = report.getTable("ReadLengthDistribution"); int length = Math.abs(samRecord.getReadLength()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index e6eddc0b7..d9b71f938 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -36,8 +36,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.*; @@ -473,7 +472,7 @@ public class IndelRealigner extends ReadWalker { readsActuallyCleaned.clear(); } - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); return 0; @@ -540,7 +539,7 @@ public class IndelRealigner extends ReadWalker { // TODO -- it 
would be nice if we could use indels from 454/Ion reads as alternate consenses } - private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { + private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { if ( readsToClean.size() > 0 ) { GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); if ( manager.canMoveReads(earliestPossibleMove) ) @@ -619,7 +618,7 @@ public class IndelRealigner extends ReadWalker { } } - private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) { + private void populateKnownIndels(RefMetaDataTracker metaDataTracker, ReferenceContext ref) { for ( final VariantContext vc : metaDataTracker.getValues(known) ) { if ( indelRodsSeen.contains(vc) ) continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index 6b9bd04d2..21b3b71d8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -27,12 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.Cigar; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import 
org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -80,7 +79,7 @@ public class LeftAlignIndels extends ReadWalker { writer.addAlignment(read); } - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { // we can not deal with screwy records if ( read.getReadUnmappedFlag() || read.getCigar().numCigarElements() == 0 ) { emit(read); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java index 3965a63fb..7c73f59e9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; @@ -477,7 +477,7 @@ public class SomaticIndelDetector extends ReadWalker { @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java index 0c323934e..9954a25e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountBases extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return read.getReadLength(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java index bc178119d..f2e4cf1ad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Gender; import 
org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.DataSource; @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMales extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { Sample sample = getSampleDB().getSample(read); return sample.getGender() == Gender.MALE ? 1 : 0; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java index 80845c447..80afd19fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java @@ -4,7 +4,7 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -47,7 +47,7 @@ public class CountReadEvents extends ReadWalker> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Map> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return ReadUtils.getCigarOperatorForAllBases(read); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index d33db2925..72bda03e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -42,7 +42,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReads extends ReadWalker implements TreeReducible { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java index 971b5bb85..09d239126 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java @@ -4,7 +4,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; 
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -41,7 +41,7 @@ import java.util.List; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountTerminusEvent extends ReadWalker, Pair> { - public Pair map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Pair map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { List cigarElements = read.getCigar().getCigarElements(); CigarElement lastElement = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java index 16d614afc..ec4f081a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java @@ -29,7 +29,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -75,7 +75,7 @@ public class ReadClippingStats extends ReadWalker { private long Gs; private long Ts; - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { for(byte base: read.getReadBases()) { switch(base) { case 'A': As++; 
break; From 53376b94236066e21c575c54d349a1f965e6eba9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 17:44:35 -0400 Subject: [PATCH 11/48] Part III of GSA-462: Consistent RODBinding access across Ref and Read trackers -- shardSpan is only calculated when there some ROD is live in the GATK. No sense in paying the cost per read when you don't need it -- Update contract to allow null span or unmapped span (good catch unittests!) --- .../providers/ReadBasedReferenceOrderedView.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index adf1b34df..40fe03f4a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -57,7 +57,9 @@ public class ReadBasedReferenceOrderedView implements View { private final GenomeLoc shardSpan; public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - this(provider.getGenomeLocParser(), ((ReadShard)provider.getShard()).getReadsSpan()); + this.genomeLocParser = provider.getGenomeLocParser(); + // conditional to optimize the case where we don't have any ROD data + this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null; provider.register(this); if ( provider.getReferenceOrderedData() != null && ! 
shardSpan.isUnmapped() ) { @@ -66,10 +68,6 @@ public class ReadBasedReferenceOrderedView implements View { } } - private ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, final GenomeLoc shardSpan) { - this.genomeLocParser = genomeLocParser; - this.shardSpan = shardSpan; - } /** * Testing constructor @@ -78,7 +76,8 @@ public class ReadBasedReferenceOrderedView implements View { final GenomeLoc shardSpan, final List names, final List> featureSources) { - this(genomeLocParser, shardSpan); + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; for ( int i = 0; i < names.size(); i++ ) states.add(new RMDDataState(names.get(i), featureSources.get(i))); } @@ -106,10 +105,10 @@ public class ReadBasedReferenceOrderedView implements View { return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); } - @Requires({"interval != null", "shardSpan.containsP(interval)"}) + @Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"}) @Ensures("result != null") public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { - if ( states.isEmpty() ) // optimization for no bindings (common for read walkers) + if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) return EMPTY_TRACKER; else { final List bindings = new ArrayList(states.size()); From ce3d1f89ea6c79f3765e6174476f6031474dc60a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 18:43:02 -0400 Subject: [PATCH 13/48] ReadShard are no longer allowed to span multiple contigs -- Previous behavior was unnecessary and causes all sorts of problems with RODs for reads. The old implementation simply failed in this case. The new code handles this correctly by forcing shards to have all of their data on a single contig. 
-- Added a PrintReads integration test to ensure this behavior is correct -- Adding test BAMs that have < 200 reads and span across contig boundaries --- .../sting/gatk/datasources/reads/ReadShard.java | 4 ++++ .../sting/gatk/datasources/reads/SAMDataSource.java | 12 +++++++++--- .../gatk/walkers/PrintReadsIntegrationTest.java | 3 ++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 9e1c12186..fd1ee9859 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -140,6 +141,9 @@ public class ReadShard extends Shard { String contig = null; for ( final SAMRecord read : reads ) { + if ( contig != null && ! read.getReferenceName().equals(contig) ) + throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. 
" + + "First contig is " + contig + " next read was " + read.getReferenceName() ); contig = read.getReferenceName(); if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 7f0a0c4c0..c8b654f81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -486,9 +486,15 @@ public class SAMDataSource { CloseableIterator iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate); while(!shard.isBufferFull() && iterator.hasNext()) { - read = iterator.next(); - shard.addRead(read); - noteFilePositionUpdate(positionUpdates,read); + final SAMRecord nextRead = iterator.next(); + if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { + // only add reads to the shard if they are on the same contig + read = nextRead; + shard.addRead(read); + noteFilePositionUpdate(positionUpdates,read); + } else { + break; + } } // If the reads are sorted in queryname order, ensure that all reads diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java index 057cf1cf9..717d9d953 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java @@ -38,7 +38,8 @@ public class PrintReadsIntegrationTest extends WalkerTest { {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e920b8505e7e95d67634b0905237dbc")}, {new PRTest(b37KGReference, 
"unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "13bb9a91b1d4dd2425f73302b8a1ac1c")}, {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e920b8505e7e95d67634b0905237dbc")}, - {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")} + {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")}, + {new PRTest(b37KGReference, "NA12878.1_10mb_2_10mb.bam", "", "c43380ac39b98853af457b90e52f8427")} }; } From 21dd70ed365ada928a5389db75b07966aa35202e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 10:10:00 -0400 Subject: [PATCH 14/48] Test to ensure that ReadBasedReferenceOrderedView produces stateless objects -- Stateless objects are required for nano-scheduling. This means you can take the RefMetaDataTracker provided by ReadBasedReferenceOrderedView, store it way, get another from the same view, and the original one behaves the same. --- ...ReadBasedReferenceOrderedViewUnitTest.java | 72 +++++++++++-------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index 6aa860a2e..d55c48054 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -249,21 +249,23 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); } - // all pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { - tests.add(new Object[]{singleTest}); - } + for ( final boolean testStateless : Arrays.asList(true, false) ) { + // 
all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } - // all 3 way pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - tests.add(new Object[]{singleTest}); + // all 3 way pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } } return tests.toArray(new Object[][]{}); } @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") - public void runReadMetaDataTrackerTest(final List RODs) { + public void runReadMetaDataTrackerTest(final List RODs, final boolean testStateless) { final List names = new ArrayList(); final List> iterators = new ArrayList>(); final List intervals = new ArrayList(); @@ -282,31 +284,45 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { final GenomeLoc span = span(intervals); final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators); - for ( final GenomeLoc interval : intervals ) { - final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + if ( testStateless ) { + // test each tracker is well formed, as each is created + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + testMetaDataTrackerBindings(tracker, interval, RODs, rodBindings); + } + } else { + // tests all trackers are correct after reading them into an array + // this checks that the trackers are be safely stored away and analyzed later (critical for nano-scheduling) + final List trackers = new ArrayList(); + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + trackers.add(tracker); + } - 
for ( int i = 0; i < RODs.size(); i++ ) { - final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); - final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); - final Set queryFeatures = new HashSet(queryFeaturesList); - final Set overlaps = test.getExpectedOverlaps(interval); - - Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); - - BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + for ( int i = 0; i < trackers.size(); i++) { + testMetaDataTrackerBindings(trackers.get(i), intervals.get(i), RODs, rodBindings); } } } - /** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 8/29/12 - * Time: 1:19 PM - * To change this template use File | Settings | File Templates. - */ + private void testMetaDataTrackerBindings(final RefMetaDataTracker tracker, + final GenomeLoc interval, + final List RODs, + final List> rodBindings) { + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." 
+ + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + static class TribbleIteratorFromCollection implements Iterator { // current location private final String name; From 792092b8917128868aedfbc4d5c86327dedb0371 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 10:39:16 -0400 Subject: [PATCH 16/48] ReadShards now default to 10K (up from 1K) reads per samFile up to 250K -- This should help make the inputs for parallel read walkers a little meater, and avoid spinning the shard creation infrastructure so often --- .../sting/gatk/datasources/reads/SAMDataSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index c8b654f81..2b88775b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -262,7 +262,7 @@ public class SAMDataSource { else { // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. - ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); + ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000)); } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); From 7b366d404900dd456ed271b96bfa03e0ef7b949d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Aug 2012 11:01:01 -0400 Subject: [PATCH 17/48] misc cleanup in active region traversal. 
--- .../sting/gatk/traversals/TraverseActiveRegions.java | 12 ++++++------ .../sting/gatk/walkers/ActiveRegionWalker.java | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index af981e676..ecaa15fe9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -185,7 +185,7 @@ public class TraverseActiveRegions extends TraversalEngine walker ) { // Just want to output the active regions to a file, not actually process them - for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion : workQueue ) { + for( final ActiveRegion activeRegion : workQueue ) { if( activeRegion.isActive ) { walker.activeRegionOutStream.println( activeRegion.getLocation() ); } @@ -198,7 +198,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { + private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { final ArrayList placedReads = new ArrayList(); for( final GATKSAMRecord read : reads ) { final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - org.broadinstitute.sting.utils.activeregion.ActiveRegion bestRegion = activeRegion; - for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) { + ActiveRegion 
bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); bestRegion = otherRegionToTest; @@ -229,7 +229,7 @@ public class TraverseActiveRegions extends TraversalEngine extends Walker Date: Thu, 30 Aug 2012 15:07:02 -0400 Subject: [PATCH 18/48] Bugfix to compareTo and equals in GenomeLoc -- Yes, GenomeLoc.compareTo was broken. The compareTo function only considered the contig and start position, but not the stop, when comparing genome locs. -- Updated GenomeLoc.compareTo function to account for stop. Updated GATK code where necessary to fix resulting problems that depended on this. -- Added unit tests to ensure that hashcode, equals, and compareTo are all correct for GenomeLocs --- .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../broadinstitute/sting/utils/GenomeLoc.java | 5 +- .../sting/utils/GenomeLocUnitTest.java | 56 +++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index f33dd414b..2763bca7c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -48,9 +48,7 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last ); - GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur ); - return curLoc.compareTo(lastLoc) == -1; + return 
last.getAlignmentStart() > cur.getAlignmentStart(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 0b35dd599..6df9c9f1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -427,7 +427,10 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome result = cmpContig; } else { if ( this.getStart() < that.getStart() ) result = -1; - if ( this.getStart() > that.getStart() ) result = 1; + else if ( this.getStart() > that.getStart() ) result = 1; + // these have the same start, so check the ends + else if ( this.getStop() < that.getStop() ) result = -1; + else if ( this.getStop() > that.getStop() ) result = 1; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 49778a4d8..122e0265f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -211,4 +212,59 @@ public class GenomeLocUnitTest extends BaseTest { Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); } } + + // ------------------------------------------------------------------------------------- + // + // testing comparison, hashcode, and equals + // + // ------------------------------------------------------------------------------------- + + @DataProvider(name = "GenomeLocComparisons") + public Object[][] createGenomeLocComparisons() { + List tests = new ArrayList(); + + final int start = 10; + for ( 
int stop = start; stop < start + 3; stop++ ) { + final GenomeLoc g1 = genomeLocParser.createGenomeLoc("chr2", start, stop); + for ( final String contig : Arrays.asList("chr1", "chr2", "chr3")) { + for ( int start2 = start - 1; start2 <= stop + 1; start2++ ) { + for ( int stop2 = start2; stop2 < stop + 2; stop2++ ) { + final GenomeLoc g2 = genomeLocParser.createGenomeLoc(contig, start2, stop2); + + ComparisonResult cmp = ComparisonResult.EQUALS; + if ( contig.equals("chr3") ) cmp = ComparisonResult.LESS_THAN; + else if ( contig.equals("chr1") ) cmp = ComparisonResult.GREATER_THAN; + else if ( start < start2 ) cmp = ComparisonResult.LESS_THAN; + else if ( start > start2 ) cmp = ComparisonResult.GREATER_THAN; + else if ( stop < stop2 ) cmp = ComparisonResult.LESS_THAN; + else if ( stop > stop2 ) cmp = ComparisonResult.GREATER_THAN; + + tests.add(new Object[]{g1, g2, cmp}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private enum ComparisonResult { + LESS_THAN(-1), + EQUALS(0), + GREATER_THAN(1); + + final int cmp; + + private ComparisonResult(int cmp) { + this.cmp = cmp; + } + } + + @Test(dataProvider = "GenomeLocComparisons") + public void testGenomeLocComparisons(GenomeLoc g1, GenomeLoc g2, ComparisonResult expected) { + Assert.assertEquals(g1.compareTo(g2), expected.cmp, "Comparing genome locs failed"); + Assert.assertEquals(g1.equals(g2), expected == ComparisonResult.EQUALS); + if ( expected == ComparisonResult.EQUALS ) + Assert.assertEquals(g1.hashCode(), g2.hashCode(), "Equal genome locs don't have the same hash code"); + } } From 72cf6bdd9f7d675797d0a76902907e3af05cea56 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:10:58 -0400 Subject: [PATCH 19/48] Fix GSA-529: Fix RODs for parallel read walkers -- TraverseReadsNano modified to read in all input data before invoking maps, so the input to TraverseReadsNano is a MapData object holding the sam record, the ref context, and the refmetadatatracker. 
-- Update ValidateRODForReads to be tree reducible, using synchronized map and explicitly sort the output map from locations -> counts in onTraversalDone -- Expanded integration tests to test nt 1, 2, 4. --- .../gatk/traversals/TraverseReadsNano.java | 91 +++++++++++-------- .../utils/nanoScheduler/NanoScheduler.java | 5 +- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 081c6b8fc..b397cb8c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -27,16 +27,21 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.ArrayList; +import java.util.List; + /** * @author aaron * @version 1.0 @@ -50,12 +55,13 @@ public class 
TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; + private static final int MIN_GROUP_SIZE = 100; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = bufferSize / 10 + 1; - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); + nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); } @Override @@ -79,24 +85,42 @@ public class TraverseReadsNano extends TraversalEngine, if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - if ( dataProvider.hasReferenceOrderedData() ) - throw new ReviewedStingException("Parallel read walkers currently don't support access to reference ordered data"); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(reads, reference, rodView, walker); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(reads.iterator().iterator(), myMap, sum, myReduce); + T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); // TODO -- how do we print progress? 
//printProgress(dataProvider.getShard(), ???); return result; } + private List aggregateMapData(final ReadShardDataProvider dataProvider) { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final List mapData = new ArrayList(); // TODO -- need size of reads + for ( final SAMRecord read : reads ) { + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); + } + + return mapData; + } + @Override public void printOnTraversalDone() { nanoScheduler.shutdown(); @@ -116,36 +140,31 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { - final ReadView reads; - final ReadReferenceView reference; - final ReadBasedReferenceOrderedView rodView; + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + private class TraverseReadsMap implements MapFunction { final ReadWalker walker; - private TraverseReadsMap(ReadView reads, ReadReferenceView reference, ReadBasedReferenceOrderedView rodView, ReadWalker walker) { - this.reads = reads; - this.reference = reference; - this.rodView = rodView; + private TraverseReadsMap(ReadWalker walker) { this.walker = walker; } 
@Override - public M apply(final SAMRecord read) { + public M apply(final MapData data) { if ( ! walker.isDone() ) { - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && reference != null - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - //dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); + final boolean keepMeP = walker.filter(data.refContext, data.read); if (keepMeP) { - return walker.map(refContext, (GATKSAMRecord) read, tracker); + return walker.map(data.refContext, data.read, data.tracker); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 4bca3728f..25ed0766d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -43,7 +43,8 @@ import java.util.concurrent.*; * Time: 9:47 AM */ public class NanoScheduler { - private static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; final int bufferSize; final int mapGroupSize; @@ -172,7 +173,7 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - if ( getnThreads() == 1 ) { + if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { return 
executeSingleThreaded(inputReader, map, initialValue, reduce); } else { return executeMultiThreaded(inputReader, map, initialValue, reduce); From 27d1c63448384d0d6b6bf74949608c7a92c42ccf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:56:58 -0400 Subject: [PATCH 20/48] Reduce the number of test combinations in ReadBasedReferenceOrderedView --- .../ReadBasedReferenceOrderedViewUnitTest.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index d55c48054..eaa098793 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -121,7 +121,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the present of a large spanning element { List oneLargeSpan = new ArrayList(handPickedFeatures); - oneLargeSpan.add(new BasicFeature(contig, 1, 100)); + oneLargeSpan.add(new BasicFeature(contig, 1, 30)); createTestsForFeatures(oneLargeSpan); } @@ -135,7 +135,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the presence of a partially spanning element at the end { List partialSpanEnd = new ArrayList(handPickedFeatures); - partialSpanEnd.add(new BasicFeature(contig, 10, 100)); + partialSpanEnd.add(new BasicFeature(contig, 10, 30)); createTestsForFeatures(partialSpanEnd); } @@ -165,7 +165,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); int featuresStop = 1; for ( final Feature f : features ) featuresStop =
Math.max(featuresStop, f.getEnd()); - for ( final int size : Arrays.asList(1, 5, 10, 100, 1000) ) { + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { final List allIntervals = new ArrayList(); // regularly spaced for ( int start = featuresStart; start < featuresStop; start++) { @@ -256,11 +256,12 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { } // all 3 way pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - tests.add(new Object[]{singleTest, testStateless}); - } + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} } + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); return tests.toArray(new Object[][]{}); } From 59508f82663ce27637c4a968b831cc6796537f1d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:57:29 -0400 Subject: [PATCH 21/48] tasking for n threads should give you n threads in NanoScheduler, not n - 1 --- .../broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 25ed0766d..668c82524 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -80,7 +80,7 @@ public class NanoScheduler { this.mapGroupSize = mapGroupSize; } - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads - 1); + this.executor = nThreads == 1 ? 
null : Executors.newFixedThreadPool(nThreads); } /** From 863a3d73b8796510ca1461d759115cf1ed4e2f11 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:21:17 -0400 Subject: [PATCH 22/48] Added ThreadSafeMapReduce interface, super of TreeReducible -- A higher level interface to declare parallelism capability of a walker. This interface means that the walker can be multi-threaded, but doesn't necessarily support TreeReducible interface, which forces you to have a combine ReduceType operation that isn't appropriate for parallel read walkers -- Updated ReadWalkers to implement ThreadSafeMapReduce not TreeReducible --- .../sting/gatk/executive/MicroScheduler.java | 19 ++++++++---- .../gatk/iterators/VerifyingSamIterator.java | 5 +-- .../sting/gatk/walkers/FlagStat.java | 7 +---- .../sting/gatk/walkers/PrintReads.java | 7 +---- .../gatk/walkers/ThreadSafeMapReduce.java | 31 +++++++++++++++++++ .../sting/gatk/walkers/TreeReducible.java | 2 +- .../sting/gatk/walkers/qc/CountReads.java | 5 ++- 7 files changed, 52 insertions(+), 24 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 70201a6cc..417a0982f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -100,22 +100,29 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. 
*/ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { - if(walker.isReduceByInterval()) + if (threadAllocation.getNumCPUThreads() > 1) { + if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - if ( walker instanceof ReadWalker ) + if ( walker instanceof ReadWalker ) { + if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); - else + } else { + // TODO -- update test for when nano scheduling only is an option + if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker); return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + } } else { - if(threadAllocation.getNumCPUThreads() > 1) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } } + private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + } + /** * Create a microscheduler given the reads and reference. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 2763bca7c..3ffe95e8b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -48,7 +47,9 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - return last.getAlignmentStart() > cur.getAlignmentStart(); + return (last.getReferenceIndex() > cur.getReferenceIndex()) || + (last.getReferenceIndex().equals(cur.getReferenceIndex()) && + last.getAlignmentStart() > cur.getAlignmentStart()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 6f28e8726..14d14aca5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements TreeReducible { +public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { @Output PrintStream out; @@ -193,11 +193,6 @@ public class FlagStat extends ReadWalker implements TreeReducible { +public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -245,9 +245,4 @@ public class PrintReads extends ReadWalker impleme output.addAlignment(read); return output; } - - @Override - public SAMFileWriter treeReduce(SAMFileWriter lhs, SAMFileWriter rhs) { - return lhs; // nothing to do - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java new file mode 100755 index 000000000..1ce469f8c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010. 
The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Root parallelism interface. Walkers that implement this + * declare that their map function is thread-safe and so multiple + * map calls can be run in parallel in the same JVM instance. + */ +public interface ThreadSafeMapReduce { +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index c950e07e4..8621c0e9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. 
*/ -public interface TreeReducible { +public interface TreeReducible extends ThreadSafeMapReduce { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 72bda03e9..856ea77f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,12 +41,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements TreeReducible { +public class CountReads extends ReadWalker implements ThreadSafeMapReduce { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } @Override public Integer reduceInit() { return 0; } @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } - @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } } From 7b4caec8cb45504fbeaf5df2c685dcb131f72c83 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:56:36 -0400 Subject: [PATCH 23/48] Fix: GSA-531 ApplyRecalibration writing to BCF: 
java.lang.String cannot be cast to java.lang.Double -- LOD must be added as a double to attributes, not as a string, so that it can be written out as BCF --- .../walkers/variantrecalibration/ApplyRecalibration.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 011f3471c..158d1e78a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -39,11 +39,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.util.*; @@ -218,7 +218,7 @@ public class ApplyRecalibration extends RodWalker implements T String filterString = null; // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY,
recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { From 82b2845b9f71cebc76d3a5953ab5a2ad4d8a3fe7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:56:36 -0400 Subject: [PATCH 24/48] Fix: GSA-531 ApplyRecalibration writing to BCF: java.lang.String cannot be cast to java.lang.Double -- LOD must be added as a double to attributes, not as a string, so that it can be written out as BCF --- .../walkers/variantrecalibration/ApplyRecalibration.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 011f3471c..158d1e78a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -39,11 +39,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.util.*; @@ -218,7 +218,7 @@ public class ApplyRecalibration extends RodWalker implements T String filterString = null; // Annotate the new record with its VQSLOD and the worst
performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { From 5a142fe2656643ac8d2b6b3c356d83f233d8724b Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 30 Aug 2012 17:57:31 -0400 Subject: [PATCH 25/48] After discussion with Ryan/Eric, the Structural_Indel variant type is now gone, and has been entirely replaced with the access pattern .isStructuralIndel(). This makes it a strict subtype of indel. I agree that this method is a bit more sensible. In addition, fix for GSA-310. If the supplied -rf argument does not match a known read filter, the list of read filters will be printed, and users directed to the documentation for more information. --- .../sting/gatk/filters/FilterManager.java | 26 +++++++++++++++++++ .../VariantDataManager.java | 1 - .../utils/classloader/PluginManager.java | 12 ++++++++- .../utils/variantcontext/VariantContext.java | 22 +++++++++------- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index 67f82235d..bddfa6a0d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -25,9 +25,13 @@ package org.broadinstitute.sting.gatk.filters; +import com.google.common.base.Function; +import com.google.common.collect.Collections2; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import java.util.Collection; +import java.util.List; /** * Manage filters and filter options.
Any requests for basic filtering classes @@ -54,4 +58,26 @@ public class FilterManager extends PluginManager { public Collection> getValues() { return this.getPlugins(); } + + /** + * Rather than use the default error message, print out a list of read filters as well. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return - A wall of text with the default message, followed by a listing of available read filters + */ + @Override + protected String formatErrorMessage(String pluginCategory, String pluginName) { + List> availableFilters = this.getPluginsImplementing(ReadFilter.class); + Collection availableFilterNames = Collections2.transform(availableFilters, new Function,String>(){ + + @Override + public String apply(final Class input) { + return getName(input); + } + }); + + return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName, + Utils.join(String.format(", "),availableFilterNames), + "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 33a543e39..aacd987d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -286,7 +286,6 @@ public class VariantDataManager { case INDEL: case MIXED: case SYMBOLIC: - case STRUCTURAL_INDEL: return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); default: return false; diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java 
b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9a2cb68db..9f1b6db93 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -277,7 +277,7 @@ public class PluginManager { public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); if( plugin == null ) - throw new UserException(String.format("Could not find %s with name: %s", pluginCategory,pluginName)); + throw new UserException(formatErrorMessage(pluginCategory,pluginName)); try { return plugin.newInstance(); } catch (Exception e) { @@ -330,4 +330,14 @@ public class PluginManager { return pluginName; } + + /** + * Generate the error message for the plugin manager. The message is allowed to depend on the class. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return error message text describing the error + */ + protected String formatErrorMessage(String pluginCategory, String pluginName ) { + return String.format("Could not find %s with name: %s", pluginCategory,pluginName); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 929e53ce7..dd16cf7e1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -457,7 +457,6 @@ public class VariantContext implements Feature { // to enable tribble integratio SNP, MNP, // a multi-nucleotide polymorphism INDEL, - STRUCTURAL_INDEL, SYMBOLIC, MIXED, } @@ -531,7 +530,17 @@ public class VariantContext implements Feature { // to enable tribble integratio } public boolean isStructuralIndel() { - return getType() 
== Type.STRUCTURAL_INDEL; + if ( getType() == Type.INDEL ) { + List sizes = getIndelLengths(); + if ( sizes != null ) { + for ( Integer length : sizes ) { + if ( length > MAX_ALLELE_SIZE_FOR_NON_SV ) { + return true; + } + } + } + } + return false; } /** @@ -716,7 +725,7 @@ public class VariantContext implements Feature { // to enable tribble integratio * @return a list of indel lengths ( null if not of type indel or mixed ) */ public List getIndelLengths() { - if ( getType() != Type.INDEL && getType() != Type.MIXED && getType() != Type.STRUCTURAL_INDEL ) { + if ( getType() != Type.INDEL && getType() != Type.MIXED ) { return null; } @@ -1263,13 +1272,6 @@ public class VariantContext implements Feature { // to enable tribble integratio // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. - // Because a number of structural variation callers write the whole alternate allele into the VCF where possible, - // this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now - // classify these as structural events, rather than indel events, as we think differently about the mechanism, - // representation, and handling of these events. 
Check for this case here: - if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV ) - return Type.STRUCTURAL_INDEL; - return Type.INDEL; // old incorrect logic: From 5a9610d87591fb9327e6fac552bdf26cba28a6b3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 10:39:16 -0400 Subject: [PATCH 26/48] ReadShards now default to 10K (up from 1K) reads per samFile up to 250K -- This should help make the inputs for parallel read walkers a little meatier, and avoid spinning the shard creation infrastructure so often --- .../sting/gatk/datasources/reads/SAMDataSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index c8b654f81..2b88775b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -262,7 +262,7 @@ public class SAMDataSource { else { // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. - ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); + ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000)); } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); From 7d95176539546585bbc76cfde2866fba64ee83c2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:07:02 -0400 Subject: [PATCH 27/48] Bugfix to compareTo and equals in GenomeLoc -- Yes, GenomeLoc.compareTo was broken. The compareTo function only considered the contig and start position, but not the stop, when comparing genome locs. -- Updated GenomeLoc.compareTo function to account for stop. 
Updated GATK code where necessary to fix resulting problems that depended on this. -- Added unit tests to ensure that hashcode, equals, and compareTo are all correct for GenomeLocs --- .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../broadinstitute/sting/utils/GenomeLoc.java | 5 +- .../sting/utils/GenomeLocUnitTest.java | 56 +++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index f33dd414b..2763bca7c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -48,9 +48,7 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last ); - GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur ); - return curLoc.compareTo(lastLoc) == -1; + return last.getAlignmentStart() > cur.getAlignmentStart(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 0b35dd599..6df9c9f1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -427,7 +427,10 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome result = cmpContig; } else { if ( this.getStart() < that.getStart() ) result = -1; - if ( this.getStart() > that.getStart() ) result = 1; + else if ( this.getStart() > that.getStart() ) result = 1; + // these have the same start, so check the 
ends + else if ( this.getStop() < that.getStop() ) result = -1; + else if ( this.getStop() > that.getStop() ) result = 1; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 49778a4d8..122e0265f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -211,4 +212,59 @@ public class GenomeLocUnitTest extends BaseTest { Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); } } + + // ------------------------------------------------------------------------------------- + // + // testing comparison, hashcode, and equals + // + // ------------------------------------------------------------------------------------- + + @DataProvider(name = "GenomeLocComparisons") + public Object[][] createGenomeLocComparisons() { + List tests = new ArrayList(); + + final int start = 10; + for ( int stop = start; stop < start + 3; stop++ ) { + final GenomeLoc g1 = genomeLocParser.createGenomeLoc("chr2", start, stop); + for ( final String contig : Arrays.asList("chr1", "chr2", "chr3")) { + for ( int start2 = start - 1; start2 <= stop + 1; start2++ ) { + for ( int stop2 = start2; stop2 < stop + 2; stop2++ ) { + final GenomeLoc g2 = genomeLocParser.createGenomeLoc(contig, start2, stop2); + + ComparisonResult cmp = ComparisonResult.EQUALS; + if ( contig.equals("chr3") ) cmp = ComparisonResult.LESS_THAN; + else if ( contig.equals("chr1") ) cmp = ComparisonResult.GREATER_THAN; + else if ( start < start2 ) cmp = ComparisonResult.LESS_THAN; + else if ( start > start2 ) cmp = 
ComparisonResult.GREATER_THAN; + else if ( stop < stop2 ) cmp = ComparisonResult.LESS_THAN; + else if ( stop > stop2 ) cmp = ComparisonResult.GREATER_THAN; + + tests.add(new Object[]{g1, g2, cmp}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private enum ComparisonResult { + LESS_THAN(-1), + EQUALS(0), + GREATER_THAN(1); + + final int cmp; + + private ComparisonResult(int cmp) { + this.cmp = cmp; + } + } + + @Test(dataProvider = "GenomeLocComparisons") + public void testGenomeLocComparisons(GenomeLoc g1, GenomeLoc g2, ComparisonResult expected) { + Assert.assertEquals(g1.compareTo(g2), expected.cmp, "Comparing genome locs failed"); + Assert.assertEquals(g1.equals(g2), expected == ComparisonResult.EQUALS); + if ( expected == ComparisonResult.EQUALS ) + Assert.assertEquals(g1.hashCode(), g2.hashCode(), "Equal genome locs don't have the same hash code"); + } } From 7a462399cee869fa345afa3da6b00d14084f9edd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:10:58 -0400 Subject: [PATCH 28/48] Fix GSA-529: Fix RODs for parallel read walkers -- TraverseReadsNano modified to read in all input data before invoking maps, so the input to TraverseReadsNano is a MapData object holding the sam record, the ref context, and the refmetadatatracker. -- Update ValidateRODForReads to be tree reducible, using synchronized map and explicitly sort the output map from locations -> counts in onTraversalDone -- Expanded integration tests to test nt 1, 2, 4. 
--- .../gatk/traversals/TraverseReadsNano.java | 91 +++++++++++-------- .../utils/nanoScheduler/NanoScheduler.java | 5 +- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 081c6b8fc..b397cb8c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -27,16 +27,21 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.ArrayList; +import java.util.List; + /** * @author aaron * @version 1.0 @@ -50,12 +55,13 @@ public class TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final 
boolean DEBUG = false; - final NanoScheduler nanoScheduler; + private static final int MIN_GROUP_SIZE = 100; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = bufferSize / 10 + 1; - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); + nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); } @Override @@ -79,24 +85,42 @@ public class TraverseReadsNano extends TraversalEngine, if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - if ( dataProvider.hasReferenceOrderedData() ) - throw new ReviewedStingException("Parallel read walkers currently don't support access to reference ordered data"); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(reads, reference, rodView, walker); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(reads.iterator().iterator(), myMap, sum, myReduce); + T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); // TODO -- how do we print progress? 
//printProgress(dataProvider.getShard(), ???); return result; } + private List aggregateMapData(final ReadShardDataProvider dataProvider) { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final List mapData = new ArrayList(); // TODO -- need size of reads + for ( final SAMRecord read : reads ) { + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); + } + + return mapData; + } + @Override public void printOnTraversalDone() { nanoScheduler.shutdown(); @@ -116,36 +140,31 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { - final ReadView reads; - final ReadReferenceView reference; - final ReadBasedReferenceOrderedView rodView; + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + private class TraverseReadsMap implements MapFunction { final ReadWalker walker; - private TraverseReadsMap(ReadView reads, ReadReferenceView reference, ReadBasedReferenceOrderedView rodView, ReadWalker walker) { - this.reads = reads; - this.reference = reference; - this.rodView = rodView; + private TraverseReadsMap(ReadWalker walker) { this.walker = walker; } 
@Override - public M apply(final SAMRecord read) { + public M apply(final MapData data) { if ( ! walker.isDone() ) { - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && reference != null - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - //dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); + final boolean keepMeP = walker.filter(data.refContext, data.read); if (keepMeP) { - return walker.map(refContext, (GATKSAMRecord) read, tracker); + return walker.map(data.refContext, data.read, data.tracker); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 4bca3728f..25ed0766d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -43,7 +43,8 @@ import java.util.concurrent.*; * Time: 9:47 AM */ public class NanoScheduler { - private static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; final int bufferSize; final int mapGroupSize; @@ -172,7 +173,7 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - if ( getnThreads() == 1 ) { + if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { return 
executeSingleThreaded(inputReader, map, initialValue, reduce); } else { return executeMultiThreaded(inputReader, map, initialValue, reduce); From 1212dfd2ef97a6847c0a2189c47c36faf1a1b54d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:56:58 -0400 Subject: [PATCH 29/48] Reduce the number of test combinations in ReadBasedReferenceOrderedView --- .../ReadBasedReferenceOrderedViewUnitTest.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index d55c48054..eaa098793 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -121,7 +121,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the present of a large spanning element { List oneLargeSpan = new ArrayList(handPickedFeatures); - oneLargeSpan.add(new BasicFeature(contig, 1, 100)); + oneLargeSpan.add(new BasicFeature(contig, 1, 30)); createTestsForFeatures(oneLargeSpan); } @@ -135,7 +135,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the presence of a partially spanning element at the end { List partialSpanEnd = new ArrayList(handPickedFeatures); - partialSpanEnd.add(new BasicFeature(contig, 10, 100)); + partialSpanEnd.add(new BasicFeature(contig, 10, 30)); createTestsForFeatures(partialSpanEnd); } @@ -165,7 +165,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); int featuresStop = 1; for ( final Feature f : features ) featuresStop = 
Math.max(featuresStop, f.getEnd()); - for ( final int size : Arrays.asList(1, 5, 10, 100, 1000) ) { + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { final List allIntervals = new ArrayList(); // regularly spaced for ( int start = featuresStart; start < featuresStop; start++) { @@ -256,11 +256,12 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { } // all 3 way pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - tests.add(new Object[]{singleTest, testStateless}); - } + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} } + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); return tests.toArray(new Object[][]{}); } From 544740d45de3cfd59090e817da8725826bffa73b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:57:29 -0400 Subject: [PATCH 30/48] tasking for n threads should give you n threads in NanoScheduler, not n - 1 --- .../broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 25ed0766d..668c82524 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -80,7 +80,7 @@ public class NanoScheduler { this.mapGroupSize = mapGroupSize; } - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads - 1); + this.executor = nThreads == 1 ? 
null : Executors.newFixedThreadPool(nThreads); } /** From 2f749b5e5271a5ecacfbe406461772e86011fb0f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:21:17 -0400 Subject: [PATCH 31/48] Added ThreadSafeMapReduce interface, super of TreeReducible -- A higher level interface to declare parallelism capability of a walker. This interface means that the walker can be multi-threaded, but doesn't necessarily support TreeReducible interface, which forces you to have a combine ReduceType operation that isn't appropriate for parallel read walkers -- Updated ReadWalkers to implement ThreadSafeMapReduce not TreeReducible --- .../sting/gatk/executive/MicroScheduler.java | 19 ++++++++---- .../gatk/iterators/VerifyingSamIterator.java | 5 +-- .../sting/gatk/walkers/FlagStat.java | 7 +---- .../sting/gatk/walkers/PrintReads.java | 7 +---- .../gatk/walkers/ThreadSafeMapReduce.java | 31 +++++++++++++++++++ .../sting/gatk/walkers/TreeReducible.java | 2 +- .../sting/gatk/walkers/qc/CountReads.java | 5 ++- 7 files changed, 52 insertions(+), 24 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 70201a6cc..417a0982f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -100,22 +100,29 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. 
*/ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { - if(walker.isReduceByInterval()) + if (threadAllocation.getNumCPUThreads() > 1) { + if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - if ( walker instanceof ReadWalker ) + if ( walker instanceof ReadWalker ) { + if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); - else + } else { + // TODO -- update test for when nano scheduling only is an option + if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker); return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + } } else { - if(threadAllocation.getNumCPUThreads() > 1) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } } + private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + } + /** * Create a microscheduler given the reads and reference. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 2763bca7c..3ffe95e8b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -48,7 +47,9 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - return last.getAlignmentStart() > cur.getAlignmentStart(); + return (last.getReferenceIndex() > cur.getReferenceIndex()) || + (last.getReferenceIndex().equals(cur.getReferenceIndex()) && + last.getAlignmentStart() > cur.getAlignmentStart()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 6f28e8726..14d14aca5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements TreeReducible { +public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { @Output PrintStream out; @@ -193,11 +193,6 @@ public class FlagStat extends ReadWalker implements TreeReducible { +public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -245,9 +245,4 @@ public class PrintReads extends ReadWalker impleme output.addAlignment(read); return output; } - - @Override - public SAMFileWriter treeReduce(SAMFileWriter lhs, SAMFileWriter rhs) { - return lhs; // nothing to do - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java new file mode 100755 index 000000000..1ce469f8c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010. 
The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Root parallelism interface. Walkers that implement this + * declare that their map function is thread-safe and so multiple + * map calls can be run in parallel in the same JVM instance. + */ +public interface ThreadSafeMapReduce { +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index c950e07e4..8621c0e9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. 
*/ -public interface TreeReducible { +public interface TreeReducible extends ThreadSafeMapReduce { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 72bda03e9..856ea77f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,12 +41,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements TreeReducible { +public class CountReads extends ReadWalker implements ThreadSafeMapReduce { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } @Override public Integer reduceInit() { return 0; } @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } - @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } } From 39400c56a95f5221b98067cd866f4d4f9a04a572 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 19:41:36 -0400 Subject: [PATCH 32/48] Update md5s for VQSR, as VQSLOD is now a double and 
gets the standard double precision treatment in VCF --- ...VariantRecalibrationWalkersIntegrationTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index b780bcd00..aec087f2c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { private static class VRTest { @@ -28,7 +28,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", "f360ce3eb2b0b887301be917a9843e2b", // tranches "287fea5ea066bf3fdd71f5ce9b58eab3", // recal file - "356b9570817b9389da71fbe991d8b2f5"); // cut VCF + "afa297c743437551cc2bd36ddd6d6d75"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -77,7 +77,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", "a8ce3cd3dccafdf7d580bcce7d660a9a", // tranches "74c10fc15f9739a938b7138909fbde04", // recal file - "62fda105e14b619a1c263855cf56af1d"); // cut VCF + "c30d163871a37f2bbf8ee7f761e870b4"); // cut VCF @DataProvider(name = "VRBCFTest") 
public Object[][] createVRBCFTest() { @@ -129,13 +129,13 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . "b7589cd098dc153ec64c02dcff2838e4", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "64f576881e21323dd4078262604717a2"); // cut VCF + "b2c6827be592c24a4692b1753edc7d23"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS "b7589cd098dc153ec64c02dcff2838e4", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "af22c55d91394c56a222fd40d6d54781"); // cut VCF + "5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -193,7 +193,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("ec519e1f01459813dab57aefffc019e2")); + Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b")); executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); } } From ac0c44720b4c5d616bc15587b3742b440ee0d008 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Aug 2012 22:49:13 -0400 Subject: [PATCH 33/48] I started to put together a set of unit tests for the PileupElement creation functionality of LocusIteratorByState and found pretty quickly that it's definitely still busted for indels. The data provider is nowhere near comprehensive yet, but I need to sit back and think about how to really test some of the functionality of LIBS. Committing what I have for now because at the very least it'll be helpful going forward (failing tests are commented out with TODO). 
--- .../LocusIteratorByStateUnitTest.java | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index edd97f17f..4480acacd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -19,6 +19,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -255,6 +256,90 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } } + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; + private static final int IS_BEFORE_DELETION_START_FLAG = 2; + private static final int IS_AFTER_DELETED_BASE_FLAG = 4; + private static final int IS_AFTER_DELETION_END_FLAG = 8; + private static final int IS_BEFORE_INSERTION_FLAG = 16; + private static final int IS_AFTER_INSERTION_FLAG = 32; + private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; + + private static class LIBSTest { + + + final String cigar; + final int readLength; + final List offsets; + final List flags; + + private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + this.cigar = cigar; + this.readLength = readLength; + this.offsets = offsets; + this.flags = flags; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + return new Object[][]{ + {new LIBSTest("1I", 1, Arrays.asList(0), 
Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, + {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, + {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + + int offset = 0; + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + 
Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + final int flag = params.flags.get(offset); + Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); + Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + + Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); + + offset++; + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + private static ReadProperties createTestReadProperties() { return new ReadProperties( Collections.emptyList(), From 817ece37a20cf935a9f38cc27b7618e45f5e1dfd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 11:42:50 -0400 Subject: [PATCH 36/48] General infrastructure for ReadTransformers -- These are like read filters but can be applied either on input, on output, or handled by the walker -- Previous example of BAQ now uses the general framework -- Resulted in massive conceptual cleanup of SAMDataSource and ReadProperties! Yeah! -- BQSR now uses this framework.
We can now do BQSR on input, on output, or within a walker -- PrintReads now handles all read transformers in the walker in map, enabling us to parallelize PrintReads with BAQ and BQSR -- Currently BQSR throws an exception when run in parallel, which a subsequent commit will fix -- Removed global variable setting in GenomeAnalysisEngine for BAQ, as command line parameters are cleanly handled by ReadTransformer infrastructure -- In principle ReadFilters are just a special kind of ReadTransformer, but this refactoring is larger than I can do. It's a JIRA entry -- Many files touched simply due to the refactoring and renaming of classes --- .../haplotypecaller/HaplotypeCaller.java | 14 +- .../sting/gatk/GenomeAnalysisEngine.java | 58 +++++-- .../sting/gatk/ReadProperties.java | 38 ++--- .../sting/gatk/WalkerManager.java | 9 +- .../gatk/datasources/reads/SAMDataSource.java | 41 ++--- .../gatk/io/stubs/SAMFileWriterStub.java | 40 +++-- .../sting/gatk/iterators/ReadTransformer.java | 144 ++++++++++++++++++ .../gatk/iterators/ReadTransformersMode.java | 28 ++++ .../sting/gatk/walkers/BAQMode.java | 4 +- .../sting/gatk/walkers/PrintReads.java | 20 ++- .../sting/gatk/walkers/Walker.java | 5 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 6 +- .../walkers/genotyper/UnifiedGenotyper.java | 3 +- .../gatk/walkers/indels/IndelRealigner.java | 3 +- .../indels/RealignerTargetCreator.java | 4 +- .../broadinstitute/sting/utils/baq/BAQ.java | 20 +-- .../sting/utils/baq/BAQReadTransformer.java | 49 ++++++ .../sting/utils/baq/BAQSamIterator.java | 59 ------- .../utils/baq/ReadTransformingIterator.java | 44 ++++++ .../sting/utils/recalibration/BQSRMode.java | 30 ++++ .../recalibration/BQSRReadTransformer.java | 40 +++++ .../utils/recalibration/BQSRSamIterator.java | 50 ------ 22 files changed, 485 insertions(+), 224 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java create mode 100644
public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 3d41b7233..f4d8a88e0 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -27,24 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.collections.Pair; @@ -52,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -101,7 +101,7 @@ import java.util.*; @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @ActiveRegionExtension(extension=65, maxRegion=300) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 00614b9aa..b9b5e452d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -42,6 +42,8 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; @@ -49,8 +51,8 @@ import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -131,6 +133,11 @@ public class GenomeAnalysisEngine { */ private Collection filters; + /** + * Collection of the read transformers applied to the reads + */ + private List readTransformers; + /** * Controls the allocation of threads between CPU vs IO. 
*/ @@ -354,6 +361,39 @@ public class GenomeAnalysisEngine { return Collections.unmodifiableList(filters); } + /** + * Returns a list of active, initialized read transformers + * + * @param walker the walker we need to apply read transformers too + * @return a non-null list of read transformers + */ + public void initializeReadTransformers(final Walker walker) { + final List activeTransformers = new ArrayList(); + + final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); + final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; + + final PluginManager pluginManager = new PluginManager(ReadTransformer.class); + + for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { + transformer.initialize(overrideTime, this, walker); + if ( transformer.enabled() ) + activeTransformers.add(transformer); + } + + setReadTransformers(activeTransformers); + } + + public List getReadTransformers() { + return readTransformers; + } + + private void setReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new ReviewedStingException("read transformers cannot be null"); + this.readTransformers = readTransformers; + } + /** * Parse out the thread allocation from the given command-line argument. 
*/ @@ -419,9 +459,6 @@ public class GenomeAnalysisEngine { argCollection.setDownsamplingMethod(method); } - public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); } - public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); } - protected boolean includeReadsWithDeletionAtLoci() { return walker.includeReadsWithDeletionAtLoci(); } @@ -702,13 +739,12 @@ public class GenomeAnalysisEngine { protected void initializeDataSources() { logger.info("Strictness is " + argCollection.strictnessLevel); - // TODO -- REMOVE ME - BAQ.DEFAULT_GOP = argCollection.BAQGOP; - validateSuppliedReference(); setReferenceDataSource(argCollection.referenceFile); validateSuppliedReads(); + initializeReadTransformers(walker); + readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); for (ReadFilter filter : filters) @@ -795,9 +831,6 @@ public class GenomeAnalysisEngine { // interrogating for the downsample method during command line recreation. setDownsamplingMethod(method); - if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) - throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested."); - if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); @@ -817,11 +850,8 @@ public class GenomeAnalysisEngine { method, new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, + readTransformers, includeReadsWithDeletionAtLoci(), - getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? 
argCollection.BAQMode : BAQ.CalculationMode.OFF, - getWalkerBAQQualityMode(), - refReader, - getBaseRecalibration(), argCollection.defaultBaseQualities, removeProgramRecords); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index e02b9d5af..b2d4d202d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -1,15 +1,14 @@ package org.broadinstitute.sting.gatk; -import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import java.util.Collection; +import java.util.List; /** * User: hanna * Date: May 14, 2009 @@ -34,12 +33,9 @@ public class ReadProperties { private final DownsamplingMethod downsamplingMethod; private final ValidationExclusion exclusionList; private final Collection supplementalFilters; + private final List readTransformers; private final boolean includeReadsWithDeletionAtLoci; private final boolean useOriginalBaseQualities; - private final BAQ.CalculationMode cmode; - private final BAQ.QualityMode qmode; - private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired - private final BaseRecalibration bqsrApplier; private final byte defaultBaseQualities; /** @@ -95,6 +91,11 @@ public class ReadProperties { return supplementalFilters; } + + public List getReadTransformers() { + return readTransformers; + } + /** * Return whether to use original base qualities. 
* @return Whether to use original base qualities. @@ -103,16 +104,6 @@ public class ReadProperties { return useOriginalBaseQualities; } - - public BAQ.QualityMode getBAQQualityMode() { return qmode; } - public BAQ.CalculationMode getBAQCalculationMode() { return cmode; } - - public IndexedFastaSequenceFile getRefReader() { - return refReader; - } - - public BaseRecalibration getBQSRApplier() { return bqsrApplier; } - /** * @return Default base quality value to fill reads missing base quality information. */ @@ -134,9 +125,6 @@ public class ReadProperties { * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method * will explicitly list reads with deletion over the current reference base; otherwise, only observed * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param cmode How should we apply the BAQ calculation to the reads? - * @param qmode How should we apply the BAQ calculation to the reads? - * @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. */ public ReadProperties( Collection samFiles, @@ -146,11 +134,8 @@ public class ReadProperties { DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, Collection supplementalFilters, + List readTransformers, boolean includeReadsWithDeletionAtLoci, - BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, - IndexedFastaSequenceFile refReader, - BaseRecalibration bqsrApplier, byte defaultBaseQualities) { this.readers = samFiles; this.header = header; @@ -158,12 +143,9 @@ public class ReadProperties { this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; this.exclusionList = exclusionList == null ? 
new ValidationExclusion() : exclusionList; this.supplementalFilters = supplementalFilters; + this.readTransformers = readTransformers; this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; this.useOriginalBaseQualities = useOriginalBaseQualities; - this.cmode = cmode; - this.qmode = qmode; - this.refReader = refReader; - this.bqsrApplier = bqsrApplier; this.defaultBaseQualities = defaultBaseQualities; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index 8843d4bfe..ae59ce438 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -29,13 +29,14 @@ import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet; import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import java.lang.annotation.Annotation; import java.util.*; /** @@ -319,11 +320,11 @@ public class WalkerManager extends PluginManager { return downsamplingMethod; } - public static BAQ.QualityMode getBAQQualityMode(Walker walker) { - return walker.getClass().getAnnotation(BAQMode.class).QualityMode(); + public static T getWalkerAnnotation(final Walker walker, final Class clazz) { + return walker.getClass().getAnnotation(clazz); } - public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) { + public static 
ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) { return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 2b88775b1..7d027438b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.MergingSamRecordIterator; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.*; @@ -42,12 +41,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.baq.BAQSamIterator; +import org.broadinstitute.sting.utils.baq.ReadTransformingIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator; -import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; @@ -200,11 +196,8 @@ public class SAMDataSource { downsamplingMethod, exclusionList, supplementalFilters, + Collections.emptyList(), includeReadsWithDeletionAtLoci, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, false); } @@ -234,11 +227,8 @@ public class SAMDataSource { DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, 
Collection supplementalFilters, + List readTransformers, boolean includeReadsWithDeletionAtLoci, - BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, - IndexedFastaSequenceFile refReader, - BaseRecalibration bqsrApplier, byte defaultBaseQualities, boolean removeProgramRecords) { this.readMetrics = new ReadMetrics(); @@ -308,11 +298,8 @@ public class SAMDataSource { downsamplingMethod, exclusionList, supplementalFilters, + readTransformers, includeReadsWithDeletionAtLoci, - cmode, - qmode, - refReader, - bqsrApplier, defaultBaseQualities); // cache the read group id (original) -> read group id (merged) @@ -603,10 +590,7 @@ public class SAMDataSource { readProperties.getDownsamplingMethod().toFraction, readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getSupplementalFilters(), - readProperties.getBAQCalculationMode(), - readProperties.getBAQQualityMode(), - readProperties.getRefReader(), - readProperties.getBQSRApplier(), + readProperties.getReadTransformers(), readProperties.defaultBaseQualities()); } @@ -673,10 +657,7 @@ public class SAMDataSource { Double downsamplingFraction, Boolean noValidationOfReadOrder, Collection supplementalFilters, - BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, - IndexedFastaSequenceFile refReader, - BaseRecalibration bqsrApplier, + List readTransformers, byte defaultBaseQualities) { // *********************************************************************************** // @@ -698,11 +679,11 @@ public class SAMDataSource { // only wrap if we are replacing the original qualities or using a default base quality wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - if (bqsrApplier != null) - wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier); - - if (cmode != BAQ.CalculationMode.OFF) - wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode); + // set up read 
transformers + for ( final ReadTransformer readTransformer : readTransformers ) { + if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) + wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); + } return wrappedIterator; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java index d8e59a3dd..d2e7066e9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java @@ -31,12 +31,16 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; /** * A stub for routing and management of SAM file reading and writing. @@ -116,15 +120,15 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite */ private boolean simplifyBAM = false; + private List onOutputReadTransformers = null; + /** * Create a new stub given the requested SAM file and compression level. * @param engine source of header data, maybe other data about input files. * @param samFile SAM file to (ultimately) create. 
*/ public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) { - this.engine = engine; - this.samFile = samFile; - this.samOutputStream = null; + this(engine, samFile, null); } /** @@ -133,8 +137,12 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite * @param stream Output stream to which data should be written. */ public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) { + this(engine, null, stream); + } + + private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) { this.engine = engine; - this.samFile = null; + this.samFile = samFile; this.samOutputStream = stream; } @@ -274,17 +282,29 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite this.headerOverride = header; } + private void initializeReadTransformers() { + this.onOutputReadTransformers = new ArrayList(engine.getReadTransformers().size()); + for ( final ReadTransformer transformer : engine.getReadTransformers() ) { + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT ) + onOutputReadTransformers.add(transformer); + } + } + /** * @{inheritDoc} */ - public void addAlignment( SAMRecord alignment ) { - if ( engine.getArguments().BAQMode != BAQ.CalculationMode.OFF && engine.getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_OUTPUT ) { - //System.out.printf("Writing BAQ at OUTPUT TIME%n"); - baqHMM.baqRead(alignment, engine.getReferenceDataSource().getReference(), engine.getArguments().BAQMode, engine.getWalkerBAQQualityMode()); - } + public void addAlignment( final SAMRecord readIn ) { + if ( onOutputReadTransformers == null ) + initializeReadTransformers(); + + GATKSAMRecord workingRead = (GATKSAMRecord)readIn; + + // run on output read transformers + for ( final ReadTransformer transform : onOutputReadTransformers ) + workingRead = transform.apply(workingRead); writeStarted = true; - outputTracker.getStorage(this).addAlignment(alignment); + 
outputTracker.getStorage(this).addAlignment(workingRead); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java new file mode 100644 index 000000000..d307789f3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java @@ -0,0 +1,144 @@ +package org.broadinstitute.sting.gatk.iterators; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Base class used to describe a read transformer like BAQ and BQSR + * + * Read transformers are pluggable infrastructure that modify read state + * either on input, on output, or within walkers themselves. + * + * The function apply() is called on each read seen by the GATK (after passing + * all ReadFilters) and it can do as it sees fit (without modifying the alignment) + * to the read to change qualities, add tags, etc. + * + * Initialize is called once right before the GATK traversal begins providing + * the ReadTransformer with the ability to collect and initialize data from the + * engine. + * + * Note that all ReadTransformers within the classpath are created and initialized. If one + * shouldn't be run it should look at the command line options of the engine and override + * enabled(). + * + * @since 8/31/12 + * @author depristo + */ +abstract public class ReadTransformer { + /** + * When should this read transform be applied? + */ + private ApplicationTime applicationTime; + + /** + * Keep track of whether we've been initialized already, and ensure it's not called more than once. + */ + private boolean initialized = false; + + protected ReadTransformer() {} + + /** + * Master initialization routine.
Called to setup a ReadTransform, using it's overloaded initialialSub routine. + * + * @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself + * @param engine the engine, for initializing values + * @param walker the walker we intend to run + */ + @Requires({"initialized == false", "engine != null", "walker == null"}) + @Ensures("initialized == true") + public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) { + if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); + if ( walker == null ) throw new IllegalArgumentException("walker cannot be null"); + + this.applicationTime = initializeSub(engine, walker); + if ( overrideTime != null ) this.applicationTime = overrideTime; + initialized = true; + } + + /** + * Subclasses must override this to initialize themeselves + * + * @param engine the engine, for initializing values + * @param walker the walker we intend to run + * @return the point of time we'd like this read transform to be run + */ + @Requires({"engine != null", "walker != null"}) + @Ensures("result != null") + protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker); + + /** + * Should this ReadTransformer be activated? Called after initialize, which allows this + * read transformer to look at its arguments and decide if it should be active. All + * ReadTransformers must override this, as by default they are not enabled. + * + * @return true if this ReadTransformer should be used on the read stream + */ + public boolean enabled() { + return false; + } + + /** + * Has this transformer been initialized? + * + * @return true if it has + */ + public final boolean isInitialized() { + return initialized; + } + + /** + * When should we apply this read transformer? 
+ * + * @return true if yes + */ + public final ApplicationTime getApplicationTime() { + return applicationTime; + } + + /** + * Primary interface function for a read transform to actually do some work + * + * The function apply() is called on each read seen by the GATK (after passing + * all ReadFilters) and it can do as it sees fit (without modifying the alignment) + * to the read to change qualities, add tags, etc. + * + * @param read the read to transform + * @return the transformed read + */ + @Requires("read != null") + @Ensures("result != null") + abstract public GATKSAMRecord apply(final GATKSAMRecord read); + + @Override + public String toString() { + return getClass().getSimpleName(); + } + + /** + * When should a read transformer be applied? + */ + public static enum ApplicationTime { + /** + * Walker does not tolerate this read transformer + */ + FORBIDDEN, + + /** + * apply the transformation to the incoming reads, the default + */ + ON_INPUT, + + /** + * apply the transformation to the outgoing read stream + */ + ON_OUTPUT, + + /** + * the walker will deal with the calculation itself + */ + HANDLED_IN_WALKER + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java new file mode 100644 index 000000000..be227619f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.gatk.iterators; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. 
+ */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ReadTransformersMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java index 03097887d..42582f178 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; + import java.lang.annotation.*; /** @@ -25,5 +27,5 @@ import java.lang.annotation.*; @Target(ElementType.TYPE) public @interface BAQMode { public abstract org.broadinstitute.sting.utils.baq.BAQ.QualityMode QualityMode() default org.broadinstitute.sting.utils.baq.BAQ.QualityMode.OVERWRITE_QUALS; - public abstract org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime ApplicationTime() default org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime.ON_INPUT; + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index 52ed20ef9..dca23ae66 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; @@ -91,7 +93,8 @@ import java.util.TreeSet; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { @@ -217,11 +220,20 @@ public class PrintReads extends ReadWalker impleme * The reads map function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a GATKSAMRecord + * @param readIn the read itself, as a GATKSAMRecord * @return the read itself */ - public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker ) { - return simplifyReads ? 
read.simplify() : read; + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + GATKSAMRecord workingRead = readIn; + + for ( final ReadTransformer transformer : getToolkit().getReadTransformers() ) { + if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); + workingRead = transformer.apply(workingRead); + } + + if ( simplifyReads ) workingRead = workingRead.simplify(); + + return workingRead; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 6cd2e8aea..4478f8515 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -30,12 +30,14 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.recalibration.BQSRMode; import java.util.List; @@ -48,7 +50,8 @@ import java.util.List; */ @ReadFilters(MalformedReadFilter.class) @PartitionBy(PartitionType.NONE) -@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) +@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) 
@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class}) public abstract class Walker { final protected static Logger logger = Logger.getLogger(Walker.class); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 30d2e24ef..443b493be 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -32,10 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -46,6 +45,7 @@ import org.broadinstitute.sting.utils.recalibration.QuantizationInfo; import org.broadinstitute.sting.utils.recalibration.RecalUtils; import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -104,7 +104,7 @@ import java.util.ArrayList; */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = 
{CommandLineGATK.class} ) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @By(DataSource.READS) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 507806fbe..93928a780 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; @@ -117,7 +118,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d9b71f938..76d8d85c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -111,7 +112,7 @@ import java.util.*; * @author ebanks */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_CIGAR_TAG = "OC"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index fc6df6902..a52d57031 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -33,10 +33,10 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; +import 
org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -101,7 +101,7 @@ import java.util.TreeSet; @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) public class RealignerTargetCreator extends RodWalker implements TreeReducible { /** diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 439a0d8ed..cf4d699ee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -52,13 +52,6 @@ public class BAQ { DONT_MODIFY // do the BAQ, but don't modify the quality scores themselves, just return them in the function. } - public enum ApplicationTime { - FORBIDDEN, // Walker does not tolerate BAQ input - ON_INPUT, // apply the BAQ calculation to the incoming reads, the default - ON_OUTPUT, // apply the BAQ calculation to outgoing read streams - HANDLED_IN_WALKER // the walker will deal with the BAQ calculation status itself - } - public static final String BAQ_TAG = "BQ"; private static double[] qual2prob = new double[256]; @@ -68,7 +61,7 @@ public class BAQ { } // Phred scaled now (changed 1/10/2011) - public static double DEFAULT_GOP = 40; + public static final double DEFAULT_GOP = 40; /* Takes a Phred Scale quality score and returns the error probability. 
* @@ -110,10 +103,19 @@ public class BAQ { * Use defaults for everything */ public BAQ() { - cd = convertFromPhredScale(DEFAULT_GOP); + this(DEFAULT_GOP); + } + + /** + * Use defaults for everything + */ + public BAQ(final double gapOpenPenalty) { + cd = convertFromPhredScale(gapOpenPenalty); initializeCachedData(); } + + /** * Create a new HmmGlocal object with specified parameters * diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java new file mode 100644 index 000000000..4589ffb71 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.utils.baq; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQReadTransformer extends ReadTransformer { + private BAQ baqHMM; + private IndexedFastaSequenceFile refReader; + private BAQ.CalculationMode cmode; + private BAQ.QualityMode qmode; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); + this.refReader = engine.getReferenceDataSource().getReference(); + this.cmode = engine.getArguments().BAQMode; + this.qmode = mode.QualityMode(); + baqHMM = new BAQ(engine.getArguments().BAQGOP); + + if ( qmode == 
BAQ.QualityMode.DONT_MODIFY ) + throw new ReviewedStingException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); + + if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); + + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return cmode != BAQ.CalculationMode.OFF; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + baqHMM.baqRead(read, refReader, cmode, qmode); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java deleted file mode 100644 index adfeef518..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.utils.baq; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Iterator; - -/** - * Simple iterator that applies Heng's BAQ calculation to a stream of incoming reads - */ -public class BAQSamIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final BAQ baqHMM = new BAQ(); // creates a BAQ creator with default parameters - private final IndexedFastaSequenceFile refReader; - private final BAQ.CalculationMode cmode; - private final BAQ.QualityMode qmode; - - /** - * Creates a new BAMSamIterator using the reference getter refReader and applies the BAM to the reads coming - * in from it. See BAQ docs for baqType information. 
- * - * @param refReader - * @param it - * @param cmode - * @param qmode - */ - @Requires({ - "refReader != null", - "it != null", - "cmode != null" , - "qmode != null"}) - public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.CalculationMode cmode, BAQ.QualityMode qmode) { - if ( cmode == BAQ.CalculationMode.OFF ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF"); - if ( qmode == BAQ.QualityMode.DONT_MODIFY ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with quailty mode DONT_MODIFY"); - - this.refReader = refReader; - this.it = it; - this.cmode = cmode; - this.qmode = qmode; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - //System.out.printf("BAQing during input%n"); - SAMRecord read = it.next(); - baqHMM.baqRead(read, refReader, cmode, qmode); - return read; - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java new file mode 100644 index 000000000..028e75226 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.utils.baq; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Iterator; + +/** + * Iterator that applies a ReadTransformer to a stream of reads + 
*/ +public class ReadTransformingIterator implements StingSAMIterator { + private final StingSAMIterator it; + private final ReadTransformer transformer; + + /** + * Creates a new ReadTransforming iterator + */ + @Requires({"it != null", "engine != null", "transformer != null", "transformer.isInitialized()"}) + public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { + if ( ! transformer.isInitialized() ) + throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN ) + throw new IllegalStateException("Creating a read transformer stream for a forbidden transformer " + transformer); + + this.it = it; + this.transformer = transformer; + } + + @Requires("hasNext()") + @Ensures("result != null") + public SAMRecord next() { + final GATKSAMRecord read = (GATKSAMRecord)it.next(); + return transformer.apply(read); + } + + public boolean hasNext() { return this.it.hasNext(); } + public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + public void close() { it.close(); } + public Iterator iterator() { return this; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java new file mode 100644 index 000000000..431014032 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. 
+ * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface BQSRMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java new file mode 100644 index 000000000..fae0e8c09 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java @@ -0,0 +1,40 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * A ReadTransformer that applies BQSR on the fly to reads + * + * User: rpoplin + * Date: 2/13/12 + */ +public class BQSRReadTransformer extends ReadTransformer { + private boolean enabled; + private BaseRecalibration bqsr; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + this.enabled = engine.hasBaseRecalibration(); + this.bqsr = engine.getBaseRecalibration(); + final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return enabled; + } + + /** + * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. 
+ */ + @Override + public GATKSAMRecord apply(GATKSAMRecord read) { + bqsr.recalibrateRead(read); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java deleted file mode 100644 index 048f8e58c..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 2/13/12 - */ - -public class BQSRSamIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final BaseRecalibration bqsr; - - /** - * Creates a new BQSRSamIterator and applies BQSR on the fly to incoming reads. 
- * - * @param it The incoming SamIterator to wrap - * @param bqsr The object which holds the BQSR table information and knows how to apply it - */ - @Requires({ - "it != null", - "bqsr != null"}) - public BQSRSamIterator(StingSAMIterator it, BaseRecalibration bqsr) { - if ( bqsr == null ) throw new ReviewedStingException("BUG: shouldn't create BQSRSamIterator with null recalibration object"); - - this.it = it; - this.bqsr = bqsr; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - SAMRecord read = it.next(); - bqsr.recalibrateRead((GATKSAMRecord) read); - return read; - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} From cf91d894e4c17d9a7af17abc1bdadecf3443e5bf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 11:56:40 -0400 Subject: [PATCH 37/48] Fix build problems with tests --- .../utils/baq/ReadTransformingIterator.java | 2 +- .../reads/DownsamplerBenchmark.java | 23 ++++++++--------- .../reads/SAMDataSourceUnitTest.java | 24 ++++++------------ .../LocusIteratorByStateUnitTest.java | 25 +++++++++---------- 4 files changed, 31 insertions(+), 43 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java index 028e75226..18ab9e01a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java @@ -19,7 +19,7 @@ public class ReadTransformingIterator implements StingSAMIterator { /** * Creates a new ReadTransforming iterator */ - @Requires({"it != null", "engine != null", "transformer != null", "transformer.isInitialized()"}) + @Requires({"it 
!= null", "transformer != null", "transformer.isInitialized()"}) public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { if ( ! transformer.isInitialized() ) throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 477b76e37..5aeb741ec 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -36,8 +36,8 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.baq.BAQ; import java.util.Collections; import java.util.Iterator; @@ -69,18 +69,15 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { for(int i = 0; i < reps; i++) { SAMFileReader reader = new SAMFileReader(inputFile); ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), - reader.getFileHeader(), - false, - SAMFileReader.ValidationStringency.SILENT, - downsampling.create(), - new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), - Collections.emptyList(), - false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR - (byte)0); + reader.getFileHeader(), + false, + SAMFileReader.ValidationStringency.SILENT, + downsampling.create(), + new 
ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index f2c546317..730b3f410 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -24,9 +24,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.fail; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMProgramRecord; @@ -35,24 +32,25 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; - import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; -import java.util.Iterator; +import java.util.Collections; import java.util.List; +import static org.testng.Assert.*; + /** * @author aaron * @version 1.0 @@ -183,11 +181,8 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, removeProgramRecords); @@ -205,11 +200,8 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, removeProgramRecords); diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 4480acacd..fbc063ab6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -4,25 +4,27 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import 
org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** * testing of the LocusIteratorByState @@ -349,11 +351,8 @@ public class LocusIteratorByStateUnitTest extends BaseTest { null, new ValidationExclusion(), Collections.emptyList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1 ); } From e028901d54d07330a65da9a9bff739e1e6f36f32 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 13:40:33 -0400 Subject: [PATCH 38/48] Fixed bad contract in ReadTransformer --- .../broadinstitute/sting/gatk/iterators/ReadTransformer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java index d307789f3..28348ecc2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java @@ -47,7 +47,7 @@ abstract public class ReadTransformer { * @param engine the engine, for initializing values * @param walker the walker we intend to run */ - @Requires({"initialized == false", "engine != null", "walker == null"}) + @Requires({"initialized == false", "engine != null", "walker != null"}) @Ensures("initialized == true") public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) { if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); From 27ddebee53e7d6b808c82dec5dd8849cd5014dd0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 13:41:03 -0400 Subject: [PATCH 39/48] Protect PrintReads from strange state from TraverseReadsUnitTests --- .../broadinstitute/sting/gatk/walkers/PrintReads.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index dca23ae66..a5d4b45b6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -41,10 +41,7 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; -import java.util.Collection; -import java.util.Random; -import java.util.Set; -import java.util.TreeSet; +import java.util.*; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. 
@@ -141,6 +138,7 @@ public class PrintReads extends ReadWalker impleme public boolean simplifyReads = false; + List readTransformers = Collections.emptyList(); private TreeSet samplesToChoose = new TreeSet(); private boolean SAMPLES_SPECIFIED = false; @@ -153,6 +151,9 @@ public class PrintReads extends ReadWalker impleme if ( platform != null ) platform = platform.toUpperCase(); + if ( getToolkit() != null ) + readTransformers = getToolkit().getReadTransformers(); + Collection samplesFromFile; if (!sampleFile.isEmpty()) { samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); @@ -226,7 +227,7 @@ public class PrintReads extends ReadWalker impleme public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { GATKSAMRecord workingRead = readIn; - for ( final ReadTransformer transformer : getToolkit().getReadTransformers() ) { + for ( final ReadTransformer transformer : readTransformers ) { if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); workingRead = transformer.apply(workingRead); } From c9ea213c9bc1de56180a727f6e532b94c8cb4408 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 13:42:29 -0400 Subject: [PATCH 40/48] Make BaseRecalibration thread-safe -- In the process uncovered two strange things 1 -- qualityScoreByFullCovariateKey was created but never used. Seems like a cache? 
2 -- Discovered nasty bug in BaseRecalibrator: https://jira.broadinstitute.org/browse/GSA-534 --- .../recalibration/BaseRecalibration.java | 34 ++++++++++++++----- .../utils/recalibration/ReadCovariates.java | 13 +++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index a563b18fc..0af7deec4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -27,12 +27,11 @@ package org.broadinstitute.sting.utils.recalibration; import net.sf.samtools.SAMTag; import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; @@ -46,7 +45,6 @@ import java.io.File; public class BaseRecalibration { private final static int MAXIMUM_RECALIBRATED_READ_LENGTH = 5000; - private final ReadCovariates readCovariates; private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) private final RecalibrationTables recalibrationTables; @@ -56,10 +54,23 @@ public class BaseRecalibration { private final int preserveQLessThan; private final boolean emitOriginalQuals; - private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of 
performSequentialQualityCalculation(..) for all sets of covariate values. - static { - for (int i = 0; i < EventType.values().length; i++) - qualityScoreByFullCovariateKey[i] = new NestedHashMap(); + // TODO -- was this supposed to be used somewhere? +// private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. +// static { +// for (int i = 0; i < EventType.values().length; i++) +// qualityScoreByFullCovariateKey[i] = new NestedHashMap(); +// } + + /** + * Thread local cache to allow multi-threaded use of this class + */ + private ThreadLocal readCovariatesCache; + { + readCovariatesCache = new ThreadLocal () { + @Override protected ReadCovariates initialValue() { + return new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); + } + }; } /** @@ -81,7 +92,6 @@ public class BaseRecalibration { else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. quantizationInfo.quantizeQualityScores(quantizationLevels); - readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); this.disableIndelQuals = disableIndelQuals; this.preserveQLessThan = preserveQLessThan; this.emitOriginalQuals = emitOriginalQuals; @@ -104,6 +114,11 @@ public class BaseRecalibration { } // Compute all covariates for the read + // TODO -- the need to clear here suggests there's an error in the indexing / assumption code + // TODO -- for BI and DI. Perhaps due to the indel buffer size on the ends of the reads? 
+ // TODO -- the output varies with -nt 1 and -nt 2 if you don't call clear here + // TODO -- needs to be fixed. + final ReadCovariates readCovariates = readCovariatesCache.get().clear(); RecalUtils.computeCovariates(read, requestedCovariates, readCovariates); for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings @@ -130,6 +145,7 @@ public class BaseRecalibration { } } + /** * Implements a serial recalibration of the reads using the combinational table. * First, we perform a positional recalibration, and then a subsequent dinuc correction. @@ -147,7 +163,7 @@ public class BaseRecalibration { * @param errorModel the event type * @return A recalibrated quality score as a byte */ - protected byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { + private byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { final byte qualFromRead = (byte)(long)key[1]; final double globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE), key, errorModel); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index c86bd4deb..2b682f84b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; +import java.util.Arrays; + /** * The object temporarily held by a read that describes all of it's covariates. * @@ -21,6 +23,17 @@ public class ReadCovariates { currentCovariateIndex = index; } + /** + * Necessary due to bug in BaseRecalibration recalibrateRead function. 
It is clearly seeing space it's not supposed to + * @return + */ + public ReadCovariates clear() { + for ( int i = 0; i < keys.length; i++ ) + for ( int j = 0; j < keys[i].length; j++) + Arrays.fill(keys[i][j], 0); + return this; + } + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; From 5ea7cd6dcc612e8e284a4faaccc0222302565e0f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 31 Aug 2012 14:01:54 -0400 Subject: [PATCH 41/48] Updating resource bundle: no reason to include both genotype and sites files for Omni and HM3, sites are enough. Also, don't include duplicate entry for the Mills indels. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 3dc953361..08496e284 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -125,17 +125,17 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_135_b37.leftAligned.vcf", "dbsnp_135", b37, true, false)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_genotypes_1525_samples.b37.vcf", - "1000G_omni2.5", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", + "1000G_omni2.5", b37, true, false)) - addResource(new 
Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", - "hapmap_3.3", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf", + "hapmap_3.3", b37, true, false)) addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", "1000G_phase1.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", - "Mills_and_1000G_gold_standard.indels", b37, true, true)) + "Mills_and_1000G_gold_standard.indels", b37, true, false)) // // example call set for wiki tutorial From 277ba94c7bff86ac6c67955e64b313b4f0e50707 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 31 Aug 2012 14:06:29 -0400 Subject: [PATCH 42/48] Update from dbsnp135 to dbsnp137. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 08496e284..5e66520ca 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -122,8 +122,8 @@ class GATKResourcesBundle extends QScript { // // standard VCF files. 
Will be lifted to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_135_b37.leftAligned.vcf", - "dbsnp_135", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_137_b37.leftAligned.vcf", + "dbsnp_137", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", "1000G_omni2.5", b37, true, false)) From 1b0ce511a61bc6d1906e6817bc376d6851920f7e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 19:50:20 -0400 Subject: [PATCH 43/48] Updating BQSR tests due to my change to reset BQSR calibration data --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index bd75806dd..85615962c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -127,9 +127,9 @@ public class BQSRIntegrationTest extends WalkerTest { @DataProvider(name = "PRTest") public Object[][] createPRTestData() { return new Object[][]{ - {new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")}, - {new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")}, - {new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")}, + {new PRTest("", "1532242f9fe90ef759a0faa5d85f61fb")}, + {new PRTest(" -qq -1", "3dd2c87915c96ac55c3872026574d8cb")}, + {new PRTest(" -qq 6", "5d012ee224f1cb4a7afac59e3655e20c")}, {new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")} }; } From 52d6bea8045c2f83124c31fecd83409f7ac8dc9b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 11:08:36 -0400 Subject: [PATCH 
44/48] a few more useful git ignores --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 8623fa076..456794cea 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,10 @@ queueScatterGather /bar* integrationtests/ public/testdata/onTheFlyOutputTest.vcf +private/testdata/onTheFlyOutputTest.vcf +lib +html +gatkdocs +dist +build +resources From 0892f2b8b2196a779bb9eb433b73854168c3fb3b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Sep 2012 20:18:56 -0400 Subject: [PATCH 45/48] Closing GSA-287:LocusReferenceView doesn't do very well in the case where contigs land off the end of the reference -- Confirmed that reads spanning off the end of the chromosome don't cause an exception by adding integration test for a single read that starts 7 bases from the end of chromosome 1 and spans 90 bases or so off. Added pileup integration test to ensure this behavior continues to work --- .../walkers/PileupWalkerIntegrationTest.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 9d9b91872..667b325ed 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -5,15 +5,7 @@ import org.testng.annotations.Test; import java.util.Arrays; -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Dec 1, 2009 - * Time: 9:03:34 AM - * To change this template use File | Settings | File Templates. 
- */ public class PileupWalkerIntegrationTest extends WalkerTest { - @Test public void testGnarleyFHSPileup() { String gatk_args = "-T Pileup -I " + validationDataLocation + "FHS_Pileup_Test.bam " @@ -23,4 +15,14 @@ public class PileupWalkerIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(expected_md5)); executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + @Test + public void testSingleReadAligningOffChromosome1() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.bam" + + " -R " + b37KGReference + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + executeTest("Testing single read spanning off chromosome 1", spec); + } } From c9944d81ef935223efd10643643be33f13ae0b06 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 4 Sep 2012 10:33:37 -0400 Subject: [PATCH 46/48] Skip array needs to also be used in the updateDataForRead function of the delocalized BQSR. 
--- .../bqsr/AdvancedRecalibrationEngine.java | 74 ++++++++++--------- .../walkers/bqsr/RecalibrationEngine.java | 2 +- .../bqsr/StandardRecalibrationEngine.java | 2 +- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index e5c952b76..ff1754a10 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -106,47 +106,49 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp } @Override - public synchronized void updateDataForRead(final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public synchronized void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - final ReadCovariates readCovariates = covariateKeySetFrom(read); + if( !skip[offset] ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); - tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; - tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; - tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; + tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; + 
tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; + tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; + tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = tempQualArray[eventIndex]; - final double isError = tempFractionalErrorArray[eventIndex]; + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.index; + final byte qual = tempQualArray[eventIndex]; + final double isError = tempFractionalErrorArray[eventIndex]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); - - final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(1.0, isError); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final 
RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); + final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); + final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); + final RecalDatum rgThisDatum = createDatumObject(qual, isError); + if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + rgRecalTable.put(rgThisDatum, keys[0], eventIndex); else - covPreviousDatum.increment(1.0, isError); + rgPreviousDatum.combine(rgThisDatum); + + final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); + final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); + if (qualPreviousDatum == null) + qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); + else + qualPreviousDatum.increment(1.0, isError); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); + final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); + if (covPreviousDatum == null) + covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); + else + covPreviousDatum.increment(1.0, isError); + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index ab65c1462..ce60f5a3a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -35,5 +35,5 @@ public interface RecalibrationEngine { 
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase); - public void updateDataForRead(final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); + public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 76a82a134..2b0f8ca80 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -93,7 +93,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP } @Override - public synchronized void updateDataForRead( final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public synchronized void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version"); } From d7954372020086206fb226eb620031c7a5c71b9c Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 4 Sep 2012 16:41:44 -0400 Subject: [PATCH 47/48] - New UserExceptions added for when ReadFilters or Walkers specified on the command line are not found. When -rf xxxx cannot find the class corresponding to xxxx, all read filters are printed in a better formatted way, with links to their gatk docs. - VariantAnnotatorEngine changed to call genotype annotations even if pileups and allele -> likelihood mappings are not present. 
Current genotype annotations altered to check for null pileups and null mappings. --- .../sting/gatk/filters/FilterManager.java | 30 ++++++++++---- .../annotator/AlleleBalanceBySample.java | 3 ++ .../annotator/DepthPerAlleleBySample.java | 2 +- .../annotator/MappingQualityZeroBySample.java | 2 +- .../annotator/VariantAnnotatorEngine.java | 14 +++---- .../utils/classloader/PluginManager.java | 14 ++++++- .../sting/utils/exceptions/UserException.java | 12 ++++++ .../InvalidArgumentIntegrationTest.java | 41 +++++++++++++++++++ 8 files changed, 97 insertions(+), 21 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index bddfa6a0d..5ca8a1779 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -29,6 +29,7 @@ import com.google.common.base.Function; import com.google.common.collect.Collections2; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.help.GATKDocUtils; import java.util.Collection; import java.util.List; @@ -68,16 +69,29 @@ public class FilterManager extends PluginManager { @Override protected String formatErrorMessage(String pluginCategory, String pluginName) { List> availableFilters = this.getPluginsImplementing(ReadFilter.class); - Collection availableFilterNames = Collections2.transform(availableFilters, new Function,String>(){ - @Override - public String apply(final Class input) { - return getName(input); - } - }); - return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName, - Utils.join(String.format(", "),availableFilterNames), + return String.format("Read filter %s not found. 
Available read filters:%n%n%s%n%n%s",pluginName, + userFriendlyListofReadFilters(availableFilters), "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); } + + private String userFriendlyListofReadFilters(List> filters) { + final String headName = "FilterName", headDoc = "Documentation"; + int longestNameLength = -1; + for ( Class < ? extends ReadFilter> filter : filters ) { + longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); + } + String format = " %"+longestNameLength+"s %s%n"; + + StringBuilder listBuilder = new StringBuilder(); + listBuilder.append(String.format(format,headName,headDoc)); + for ( Class filter : filters ) { + String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); + String filterName = this.getName(filter); + listBuilder.append(String.format(format,filterName,helpLink)); + } + + return listBuilder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 0104f24d9..1e1f65333 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ + if ( stratifiedContext == null ) + return; + Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 85387f7cf..ee9b51b56 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) return; if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index 354b798bb..44657a7e7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation { final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || stratifiedContext == null ) return; int mq0 = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 22ec5468f..eae13e1b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -300,16 +300,12 @@ public class VariantAnnotatorEngine { if (stratifiedPerReadAlleleLikelihoodMap != null) perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( 
context == null && perReadAlleleLikelihoodMap == null) { - // no likelihoods nor pileup available: just move on to next sample - genotypes.add(genotype); - } else { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); - } - genotypes.add(gb.make()); + + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } + genotypes.add(gb.make()); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9f1b6db93..82fb6b8d6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.utils.classloader; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -276,8 +278,16 @@ public class PluginManager { */ public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); - if( plugin == null ) - throw new UserException(formatErrorMessage(pluginCategory,pluginName)); + if( plugin == null ) { + String errorMessage = formatErrorMessage(pluginCategory,pluginName); + if ( this.getClass().isAssignableFrom(FilterManager.class) ) { + throw new 
UserException.MalformedReadFilterException(errorMessage); + } else if ( this.getClass().isAssignableFrom(WalkerManager.class) ) { + throw new UserException.MalformedWalkerArgumentsException(errorMessage); + } else { + throw new UserException.CommandLineException(errorMessage); + } + } try { return plugin.newInstance(); } catch (Exception e) { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 3130469e5..47a2f2f1d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -63,6 +63,18 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..924c6ec5a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; + 
+import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/31/12 + * Time: 11:03 AM + * To change this template use File | Settings | File Templates. + */ +public class InvalidArgumentIntegrationTest extends WalkerTest { + private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; + + private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s " + flag + " " + arg, + 1, exeption); + + } + + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s ", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} From fc06f39411563691b405887cbb030fb8791ee4e9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 5 Sep 2012 09:55:34 -0400 Subject: [PATCH 48/48] Fixed docs for Pileup walker --- .../broadinstitute/sting/gatk/walkers/Pileup.java | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 2a6ecdb8c..52c6e1560 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -45,19 +45,8 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in the pileup format. 
In the pileup format, each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping - * qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all - * encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand, - * a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the - * reverse strand. - * - * A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next - * reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence. - * Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. - * Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read - * separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality. - * A symbol '$' marks the end of a read segment. + * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. * * Associated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate]