diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4da2e1179..ce6aa32f4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -184,6 +184,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; + /** + * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling + * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the + * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking + * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, + * and may make use of them in assembly and calling, where possible. + */ + @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false) + protected boolean includeUnmappedReads = false; + @Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false) protected boolean USE_ALLELES_TRIGGER = false; @@ -354,11 +364,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // enable non primary and extended reads in the active region @Override public EnumSet desiredReadStates() { - return EnumSet.of( - ActiveRegionReadState.PRIMARY, - ActiveRegionReadState.NONPRIMARY, - ActiveRegionReadState.EXTENDED - ); + if ( includeUnmappedReads ) { + throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional"); +// return EnumSet.of( +// ActiveRegionReadState.PRIMARY, +// ActiveRegionReadState.NONPRIMARY, +// ActiveRegionReadState.EXTENDED, +// ActiveRegionReadState.UNMAPPED +// ); + } else + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index f5018db8c..ab539c9dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -32,8 +32,7 @@ import org.broadinstitute.sting.utils.NGSPlatform; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Easy to use creator of artificial BAM files for testing @@ -64,6 +63,8 @@ public class ArtificialBAMBuilder { int readLength = 10; private final ArrayList samples = new ArrayList(); + private LinkedList additionalReads = new LinkedList(); + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); { factory.setCreateIndex(true); @@ -118,6 +119,14 @@ public class ArtificialBAMBuilder { return this; } + public void addReads(final GATKSAMRecord readToAdd) { + additionalReads.add(readToAdd); + } + + public void addReads(final Collection readsToAdd) { + additionalReads.addAll(readsToAdd); + } + public List getSamples() { return samples; } @@ -145,6 +154,11 @@ public class ArtificialBAMBuilder { } } + if ( ! additionalReads.isEmpty() ) { + reads.addAll(additionalReads); + Collections.sort(reads, new SAMRecordCoordinateComparator()); + } + return reads; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 15d4eec2d..319af5ec5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -501,6 +501,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return providers; } + // --------------------------------------------------------------------------------------------------------- + // + // Combinatorial tests to ensure reads are going into the right regions + // + // --------------------------------------------------------------------------------------------------------- + @DataProvider(name = "CombinatorialARTTilingProvider") public Object[][] makeCombinatorialARTTilingProvider() { final List tests = new LinkedList(); @@ -582,7 +588,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } - @Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider") + @Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider") public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); final List intervals = Arrays.asList( @@ -597,10 +603,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary for ( final ActiveRegion region : activeRegionsMap.values() ) { + final Set readNamesInRegion = readNamesInRegion(region); int nReadsExpectedInRegion = 0; for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); - final Set readNamesInRegion = readNamesInRegion(region); boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) ? region.getExtendedLoc().overlapsP(readLoc) @@ -629,4 +635,53 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { readNames.add(read.getReadName()); return readNames; } + + // --------------------------------------------------------------------------------------------------------- + // + // Make sure all insertion reads are properly included in the active regions + // + // --------------------------------------------------------------------------------------------------------- + + @Test + public void ensureAllInsertionReadsAreInActiveRegions() { + + final int readLength = 10; + final int start = 20; + final int nReadsPerLocus = 10; + final int nLoci = 3; + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setAlignmentStart(start); + + // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); + allI.setCigarString(readLength + "I"); + allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); + + bamBuilder.addReads(allI); + + final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); + activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final ActiveRegion region = activeRegionsMap.values().iterator().next(); + int nReadsExpectedInRegion = 0; + + final Set readNamesInRegion = readNamesInRegion(region); + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), + "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); + nReadsExpectedInRegion++; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } }