diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index e4dcaec54..957eb1aba 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -314,10 +314,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { - // ignore reduced reads because they are always on the forward strand! - // TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test - if ( read.isReducedRead() ) - return; final boolean matchesRef = allele.equals(ref, true); final boolean matchesAlt = allele.equals(alt, true); @@ -326,12 +322,17 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat final int row = matchesRef ? 0 : 1; if ( read.isStrandless() ) { - // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 - // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even - // if the read is only seen once, because it's a merged read or other) - final int toAdd = Math.max(representativeCount / 2, 1); - table[row][0] += toAdd; - table[row][1] += toAdd; + + // ignore strandless reduced reads because they are always on the forward strand! 
+ if ( !read.isReducedRead() ) { + + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + final int toAdd = Math.max(representativeCount / 2, 1); + table[row][0] += toAdd; + table[row][1] += toAdd; + } } else { // a normal read with an actual strand final boolean isFW = !read.getReadNegativeStrandFlag(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index c3a0618ef..dd57c8ac6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -88,10 +89,12 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn int deletions = 0; int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - deletions += pileup.getNumberOfDeletions(); - depth += pileup.getNumberOfElements(); + for ( final PileupElement p : sample.getValue().getBasePileup() ) { + final 
int actualSampleDepth = p.getRepresentativeCount(); + depth += actualSampleDepth; + if ( p.isDeletion() ) + deletions += actualSampleDepth; + } } Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 1cd9c1bc0..3532a74fb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -50,6 +50,10 @@ import it.unimi.dsi.fastutil.ints.IntArrayList; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + /** * The element that describes the header of the sliding window. @@ -264,29 +268,56 @@ public class HeaderElement { } /** - * Calculates the number of haplotypes necessary to represent this site. + * Calculates the number of alleles necessary to represent this site. * * @param minVariantProportion the minimum proportion to call a site variant. - * @return the number of alleles necessary to represent this site. + * @param allowDeletions should we allow deletions? + * @return the number of alleles necessary to represent this site or -1 if allowDeletions is false and there are a sufficient number of them */ - public int getNumberOfAlleles(final double minVariantProportion) { + public int getNumberOfAlleles(final double minVariantProportion, final boolean allowDeletions) { + final List alleles = getAlleles(minVariantProportion, allowDeletions); + return alleles == null ? -1 : alleles.size(); + } + + /** + * Calculates the alleles necessary to represent this site. 
+ * + * @param minVariantProportion the minimum proportion to call a site variant. + * @param allowDeletions should we allow deletions? + * @return the list of alleles necessary to represent this site or null if allowDeletions is false and there are a sufficient number of them + */ + public List getAlleles(final double minVariantProportion, final boolean allowDeletions) { final int totalBaseCount = consensusBaseCounts.totalCount(); - if (totalBaseCount == 0) - return 0; + if ( totalBaseCount == 0 ) + return Collections.emptyList(); - final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount); + final int minBaseCountForRelevantAlleles = Math.max(1, (int)(minVariantProportion * totalBaseCount)); - int nAlleles = 0; - for ( BaseIndex base : BaseIndex.values() ) { + final List alleles = new ArrayList(4); + for ( final BaseIndex base : BaseIndex.values() ) { final int baseCount = consensusBaseCounts.countOfBase(base); - // don't consider this allele if the count is 0 - if ( baseCount == 0 ) - continue; - - if ( baseCount >= minBaseCountForRelevantAlleles ) - nAlleles++; + if ( baseCount >= minBaseCountForRelevantAlleles ) { + if ( !allowDeletions && base == BaseIndex.D ) + return null; + alleles.add(base); + } } - return nAlleles; + return alleles; + } + + /* + * Checks whether there are a significant number of softclips. + * + * @param minVariantProportion the minimum proportion to consider something significant. 
+ * @return true if there are significant softclips, false otherwise + */ + public boolean hasSignificantSoftclips(final double minVariantProportion) { + final int totalBaseCount = consensusBaseCounts.totalCount(); + if ( totalBaseCount == 0 ) + return false; + + final int minBaseCountForSignificance = Math.max(1, (int)(minVariantProportion * totalBaseCount)); + return nSoftClippedBases >= minBaseCountForSignificance; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 2f377bee8..42873964d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -46,9 +46,11 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Ensures; import it.unimi.dsi.fastutil.objects.*; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -97,43 +99,62 @@ public class MultiSampleCompressor { final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy, - final boolean allowPolyploidReduction) { + final ReduceReads.DownsampleStrategy downsampleStrategy) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, 
minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } } - public ObjectSet addAlignment(GATKSAMRecord read) { + /** + * Add an alignment to the compressor + * + * @param read the read to be added + * @param knownSnpPositions the set of known SNP positions + * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) + */ + public ObjectSet addAlignment(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { String sampleName = read.getReadGroup().getSample(); SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); if ( compressor == null ) throw new ReviewedStingException("No compressor for sample " + sampleName); - Pair, CompressionStash> readsAndStash = compressor.addAlignment(read); + Pair, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions); ObjectSet reads = readsAndStash.getFirst(); CompressionStash regions = readsAndStash.getSecond(); - reads.addAll(closeVariantRegionsInAllSamples(regions)); + reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions)); return reads; } - public ObjectSet close() { + /** + * Properly closes the compressor. 
+ * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + @Ensures("result != null") + public ObjectSet close(final ObjectSortedSet knownSnpPositions) { ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { - Pair, CompressionStash> readsAndStash = sample.close(); - reads = readsAndStash.getFirst(); + Pair, CompressionStash> readsAndStash = sample.close(knownSnpPositions); + reads.addAll(readsAndStash.getFirst()); } return reads; } - private ObjectSet closeVariantRegionsInAllSamples(CompressionStash regions) { + /** + * Finalizes current variant regions. + * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + private ObjectSet closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { for (SingleSampleCompressor sample : compressorsPerSample.values()) { - reads.addAll(sample.closeVariantRegions(regions)); + reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions)); } } return reads; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 62410d191..5e9429284 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -54,9 +54,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.util.SequenceUtil; -import org.broadinstitute.sting.commandline.Argument; -import 
org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -75,6 +73,10 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.Collections; +import java.util.List; /** @@ -147,10 +149,12 @@ public class ReduceReads extends ReadWalker, Redu private byte minTailQuality = 2; /** - * Allow the experimental polyploid-based reduction capabilities of this tool + * Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction. + * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored. + * Note that polyploid ("het") compression will work only when a single SNP is present in a consensus window. 
*/ - @Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false) - private boolean USE_POLYPLOID_REDUCTION = false; + @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false) + public List> known = Collections.emptyList(); /** * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals @@ -249,6 +253,8 @@ public class ReduceReads extends ReadWalker, Redu ObjectSortedSet intervalList; + final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); + // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; @@ -359,8 +365,22 @@ public class ReduceReads extends ReadWalker, Redu for (GATKSAMRecord mappedRead : mappedReads) System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd()); - return mappedReads; + // add the SNPs to the list of known positions + populateKnownSNPs(metaDataTracker); + return mappedReads; + } + + /* + * Add the positions of known SNPs to the set so that we can keep track of it + * + * @param metaDataTracker the ref meta data tracker + */ + protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) { + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( vc.isSNP() ) + knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc)); + } } /** @@ -373,7 +393,7 @@ public class ReduceReads extends ReadWalker, Redu */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, 
minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } /** @@ -405,7 +425,7 @@ public class ReduceReads extends ReadWalker, Redu if (debugLevel == 1) System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd()); - for (GATKSAMRecord compressedRead : stash.compress(readReady)) + for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions)) outputRead(compressedRead); // We only care about maintaining the link between read pairs if they are in the same variant @@ -422,6 +442,10 @@ public class ReduceReads extends ReadWalker, Redu firstRead = false; } + // reduce memory requirements by removing old positions + if ( !mappedReads.isEmpty() ) + clearStaleKnownPositions(mappedReads.get(0)); + return stash; } @@ -434,13 +458,38 @@ public class ReduceReads extends ReadWalker, Redu public void onTraversalDone(ReduceReadsStash stash) { // output any remaining reads in the compressor - for (GATKSAMRecord read : stash.close()) + for (GATKSAMRecord read : stash.close(knownSnpPositions)) outputRead(read); if (nwayout) writerToUse.close(); } + /** + * Removes known positions that are no longer relevant for use with het compression. 
+ * + * @param read the current read, used for checking whether there are stale positions we can remove + */ + protected void clearStaleKnownPositions(final GATKSAMRecord read) { + // nothing to clear if empty + if ( knownSnpPositions.isEmpty() ) + return; + + // not ready to be cleared until we encounter a read from a different contig + final int contigIndexOfRead = read.getReferenceIndex(); + if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead ) + return; + + // because we expect most elements to be stale, it's not going to be efficient to remove them one at a time + final ObjectAVLTreeSet goodLocs = new ObjectAVLTreeSet(); + for ( final GenomeLoc loc : knownSnpPositions ) { + if ( loc.getContigIndex() == contigIndexOfRead ) + goodLocs.add(loc); + } + knownSnpPositions.clear(); + knownSnpPositions.addAll(goodLocs); + } + /** * Hard clips away all parts of the read that doesn't agree with the intervals selected. * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java index 0a446bab7..52c5f0903 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java @@ -46,6 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import it.unimi.dsi.fastutil.objects.ObjectSortedSet; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -106,11 +108,12 @@ public class ReduceReadsStash { /** * sends the read to the MultiSampleCompressor * - * @param read the read to be compressed + * @param read the read to be compressed + * 
@param knownSnpPositions the set of known SNP positions * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) */ - public Iterable compress(GATKSAMRecord read) { - return compressor.addAlignment(read); + public Iterable compress(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { + return compressor.addAlignment(read, knownSnpPositions); } /** @@ -125,18 +128,19 @@ public class ReduceReadsStash { /** * Close the stash, processing all remaining reads in order * + * @param knownSnpPositions the set of known SNP positions * @return a list of all the reads produced by the SlidingWindow machinery) */ - public Iterable close() { + public Iterable close(final ObjectSortedSet knownSnpPositions) { LinkedList result = new LinkedList(); // compress all the stashed reads (in order) for (GATKSAMRecord read : outOfOrderReads) - for (GATKSAMRecord compressedRead : compressor.addAlignment(read)) + for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions)) result.add(compressedRead); // output any remaining reads from the compressor - for (GATKSAMRecord read : compressor.close()) + for (GATKSAMRecord read : compressor.close(knownSnpPositions)) result.add(read); return result; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 42db83c04..db1e0baaf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -46,7 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Ensures; import it.unimi.dsi.fastutil.objects.*; +import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -64,7 +66,6 @@ public class SingleSampleCompressor { final private double minIndelProportionToTriggerVariant; final private int minBaseQual; final private ReduceReads.DownsampleStrategy downsampleStrategy; - final private boolean allowPolyploidReduction; private SlidingWindow slidingWindow; private int slidingWindowCounter; @@ -77,8 +78,7 @@ public class SingleSampleCompressor { final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy, - final boolean allowPolyploidReduction) { + final ReduceReads.DownsampleStrategy downsampleStrategy) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -87,10 +87,16 @@ public class SingleSampleCompressor { this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minBaseQual = minBaseQual; this.downsampleStrategy = downsampleStrategy; - this.allowPolyploidReduction = allowPolyploidReduction; } - public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { + /** + * Add an alignment to the compressor + * + * @param read the read to be added + * @param knownSnpPositions the set of known SNP positions + * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) + */ + public Pair, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions ) { ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash stash = new CompressionStash(); int readOriginalStart = read.getUnclippedStart(); @@ -101,14 +107,14 @@ public class SingleSampleCompressor { 
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window // close the current sliding window - Pair, CompressionStash> readsAndStash = slidingWindow.close(); + Pair, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions); reads = readsAndStash.getFirst(); stash = readsAndStash.getSecond(); slidingWindow = null; // so we create a new one on the next if } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); slidingWindowCounter++; } @@ -116,12 +122,26 @@ public class SingleSampleCompressor { return new Pair, CompressionStash>(reads, stash); } - public Pair, CompressionStash> close() { - return (slidingWindow != null) ? slidingWindow.close() : emptyPair; + /** + * Properly closes the compressor. + * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + @Ensures("result != null") + public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { + return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair; } - public ObjectSet closeVariantRegions(CompressionStash regions) { - return slidingWindow == null ? 
ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions); + /** + * Finalizes current variant regions. + * + * @param knownSnpPositions the set of known SNP positions + * @return A non-null set/list of all reads generated + */ + @Ensures("result != null") + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { + return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 11e023b9b..8a80c5570 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -57,6 +57,7 @@ import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; @@ -65,10 +66,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.util.Comparator; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.ListIterator; +import java.util.*; /** @@ -109,8 +107,6 @@ public class SlidingWindow { protected ReduceReads.DownsampleStrategy downsampleStrategy; private boolean hasIndelQualities; - private boolean allowPolyploidReductionInGeneral; - private static 
CompressionStash emptyRegions = new CompressionStash(); /** @@ -154,7 +150,7 @@ public class SlidingWindow { this.readsInWindow = new ObjectAVLTreeSet(); } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -188,8 +184,6 @@ public class SlidingWindow { this.downsampleStrategy = downsampleStrategy; this.hasIndelQualities = hasIndelQualities; - - this.allowPolyploidReductionInGeneral = allowPolyploidReduction; } /** @@ -403,12 +397,12 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param isNegativeStrand should the synthetic read be represented as being on the negative strand? + * @param strandType the strandedness that the synthetic read should be represented as having * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. 
*/ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - protected ObjectArrayList addToSyntheticReads(LinkedList header, int start, int end, boolean isNegativeStrand) { + protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { ObjectArrayList reads = new ObjectArrayList(); if (start < end) { ListIterator headerElementIterator = header.listIterator(start); @@ -422,22 +416,22 @@ public class SlidingWindow { reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); int endOfConsensus = findNextNonConsensusElement(header, start, end); - addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand); + addToRunningConsensus(header, start, endOfConsensus, strandType); if (endOfConsensus <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); - reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand)); + reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType)); } else if (headerElement.hasFilteredData()) { reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); int endOfFilteredData = findNextNonFilteredDataElement(header, start, end); - reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand)); + reads.addAll(addToFilteredData(header, start, endOfFilteredData, strandType)); if (endOfFilteredData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); - reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand)); + reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, strandType)); } else if (headerElement.isEmpty()) { reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); @@ -446,7 +440,7 @@ public class SlidingWindow { if (endOfEmptyData <= start) throw new ReviewedStingException(String.format("next start 
is <= current start: (%d <= %d)", endOfEmptyData, start)); - reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand)); + reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType)); } else throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start)); @@ -558,16 +552,16 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param isNegativeStrand should the synthetic read be represented as being on the negative strand? + * @param strandType the strandedness that the synthetic read should be represented as having * @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated. */ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - private ObjectArrayList addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + private ObjectArrayList addToFilteredData(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { ObjectArrayList result = new ObjectArrayList(); if (filteredDataConsensus == null) - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); ListIterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { @@ -583,7 +577,7 @@ public class SlidingWindow { if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != 
headerElement.getLocation() ) { result.add(finalizeFilteredDataConsensus()); - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, isNegativeStrand); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType); } genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); @@ -601,12 +595,12 @@ public class SlidingWindow { * @param header the window header * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param isNegativeStrand should the synthetic read be represented as being on the negative strand? + * @param strandType the strandedness that the synthetic read should be represented as having */ @Requires({"start >= 0 && (end >= start || end == 0)"}) - private void addToRunningConsensus(LinkedList header, int start, int end, boolean isNegativeStrand) { + private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { if (runningConsensus == null) - runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand); + runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); Iterator headerElementIterator = header.listIterator(start); for (int index = start; index < end; index++) { @@ -642,29 +636,39 @@ public class SlidingWindow { * * @param start the first window header index in the variant region (inclusive) * 
@param stop the last window header index of the variant region (inclusive) - * @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here? + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here * @return a non-null list of all reads contained in the variant region */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { + protected ObjectList compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { ObjectList allReads = new ObjectArrayList(); // Try to compress into a polyploid consensus - int hetRefPosition = -1; - final Object[] header = windowHeader.toArray(); + // Optimization: don't bother if there are no known SNPs + final int hetRefPosition = knownSnpPositions.isEmpty() ? -1 : findSinglePolyploidCompressiblePosition(start, stop); - if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) - hetRefPosition = findSinglePolyploidCompressiblePosition(header, start, stop); + boolean successfullyCreatedPolyploidConsensus = false; - // Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress - // variant regions that are created by insertions (since we can't confirm here that they represent the same allele) - if ( hetRefPosition != -1 ) { - allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation()); + // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by + // insertions (which we don't want because we can't confirm that they represent the same allele). + // Also, we only allow polyploid consensus creation at known sites. 
+ if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) { + + // try to create the polyploid consensus + final ObjectList polyploidReads = createPolyploidConsensus(start, stop, hetRefPosition); + + // if successful we are good to go! + if ( polyploidReads != null ) { + allReads.addAll(polyploidReads); + successfullyCreatedPolyploidConsensus = true; + } } - // Return all reads that overlap the variant region and remove them from the window header entirely - // also remove all reads preceding the variant region (since they will be output as consensus right after compression - else { + + // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them + // from the window header entirely; also remove all reads preceding the variant region (since they will be output + // as consensus right after compression) + if ( !successfullyCreatedPolyploidConsensus ) { final int refStart = windowHeader.get(start).getLocation(); final int refStop = windowHeader.get(stop).getLocation(); @@ -678,35 +682,50 @@ public class SlidingWindow { toRemove.add(read); } } - removeReadsFromWindow(toRemove); + + // remove all used reads + for ( final GATKSAMRecord read : toRemove ) + readsInWindow.remove(read); } return allReads; } + /** + * Determines whether the given position matches one of the known sites + * + * @param targetPosition the position of the het site + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here + * @return true if the targetPosition matches a known SNP position, false otherwise + */ + @Requires({"targetPosition >= 1 && knownSnpPositions != null"}) + protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) { + final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition); + return
knownSnpPositions.contains(targetLoc); + } + /* * Finds the het variant position located within start and stop (inclusive) if one exists. * - * @param header the header element array * @param start the first header index in the region to check (inclusive) * @param stop the last header index of the region to check (inclusive) * @return the window header index of the single het position or -1 if either none or more than one exists */ - @Requires("header != null && start >= 0 && (stop >= start || stop == 0)") - protected int findSinglePolyploidCompressiblePosition(final Object[] header, final int start, final int stop) { + @Requires("start >= 0 && (stop >= start || stop == 0)") + protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) { int hetRefPosition = -1; for ( int i = start; i <= stop; i++ ) { - final int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); + final int nAlleles = windowHeader.get(i).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false); - // we will only work on diploid cases because we just don't want to handle/test other scenarios - if ( nAlleles > 2 ) + // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios + if ( nAlleles > 2 || nAlleles == -1 ) return -1; if ( nAlleles == 2 ) { // make sure that there is only 1 site in the region that contains more than one allele - if ( hetRefPosition >= 0 ) + if ( hetRefPosition != -1 ) return -1; hetRefPosition = i; @@ -716,21 +735,43 @@ public class SlidingWindow { return hetRefPosition; } + /* + * Checks whether there's a position in the header with a significant number of softclips. 
+ * + * @param header the window header to examine + * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument) + * @return true if there exists a position with significant softclips, false otherwise + */ + @Requires("header != null") + protected boolean hasSignificantSoftclipPosition(final List header, final int positionToSkip) { + + for ( final HeaderElement headerElement : header ) { + + if ( headerElement.getLocation() == positionToSkip ) + continue; + + if ( headerElement.hasSignificantSoftclips(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT) ) + return true; + } + + return false; + } + /** * Finalizes a variant region, any adjacent synthetic reads. * * @param start the first window header index in the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here? + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here * @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { - ObjectList allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition); + protected ObjectList closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { + ObjectList allReads = compressVariantRegion(start, stop, knownSnpPositions); ObjectList result = (downsampleCoverage > 0) ? 
downsampleVariantRegion(allReads) : allReads; - result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); + result.addAll(addToSyntheticReads(windowHeader, 0, stop+1, SyntheticRead.StrandType.STRANDLESS)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary @@ -739,10 +780,11 @@ public class SlidingWindow { /* * Finalizes the list of regions requested (and any regions preceding them) * - * @param regions the list of regions to finalize + * @param regions the list of regions to finalize + * @param knownSnpPositions the set of known SNP positions * @return a non-null set of reduced reads representing the finalized regions */ - public ObjectSet closeVariantRegions(final CompressionStash regions) { + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if ( !regions.isEmpty() ) { @@ -754,7 +796,7 @@ public class SlidingWindow { final int start = region.getStart() - windowHeaderStart; final int stop = region.getStop() - windowHeaderStart; - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track + allReads.addAll(closeVariantRegion(start, stop, knownSnpPositions)); // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. 
// Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads @@ -772,6 +814,7 @@ public class SlidingWindow { if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() ) windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight())); } + return allReads; } @@ -804,10 +847,11 @@ public class SlidingWindow { * regions that still exist regardless of being able to fulfill the * context size requirement in the end. * + * @param knownSnpPositions the set of known SNP positions * @return A non-null set/list of all reads generated */ @Ensures("result != null") - public Pair, CompressionStash> close() { + public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { // mark variant regions ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash regions = new CompressionStash(); @@ -816,10 +860,10 @@ public class SlidingWindow { if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); - finalizedReads = closeVariantRegions(regions); + finalizedReads = closeVariantRegions(regions, knownSnpPositions); if (!windowHeader.isEmpty()) { - finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false)); + finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } } @@ -863,86 +907,135 @@ public class SlidingWindow { return finalizedRead; } + // define this so that we can use Java generics below + private static class HeaderElementList extends LinkedList {} + /** - * Finalizes a variant region, any adjacent synthetic reads. 
+ * Finalizes a variant region for point mutations, and any adjacent synthetic reads. Indel sites are not supported. * - * @param start the first window header index in the variant region (inclusive) + * @param start the first window header index of the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param hetRefPosition reference position (in global coordinates) of the het site - * @return a non-null list of all reads contained in the variant region as a polyploid consensus + * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE! + * @return a list of all reads contained in the variant region as a polyploid consensus, or null if not possible */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures("result != null") - private ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { - // we will create two (positive strand, negative strand) headers for each contig - ObjectList> headersPosStrand = new ObjectArrayList>(); - ObjectList> headersNegStrand = new ObjectArrayList>(); - ObjectList hetReads = new ObjectArrayList(); - Byte2IntMap haplotypeHeaderMap = new Byte2IntArrayMap(2); - int currentHaplotype = 0; - int refStart = windowHeader.get(start).getLocation(); - int refStop = windowHeader.get(stop).getLocation(); - ObjectList toRemove = new ObjectArrayList(); - for (GATKSAMRecord read : readsInWindow) { - int haplotype; + protected ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { + // we will create two (positive strand, negative strand) headers for each haplotype + final HeaderElementList[] headersPosStrand = new HeaderElementList[2]; + final HeaderElementList[] headersNegStrand = new HeaderElementList[2]; - // check if the read is either before or inside the variant region - if (read.getSoftStart() <= refStop) { - // check if the read is inside the variant region - if 
(read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) { - // check if the read contains the het site - if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) { - int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - // TODO -- THIS IS A HUGE BUG AS IT WILL NOT WORK FOR DELETIONS; see commented out unit test - byte base = read.getReadBases()[readPos]; - byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos]; + final int refStart = windowHeader.get(start).getLocation(); + final int refStop = windowHeader.get(stop).getLocation(); + final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation(); - // check if base passes the filters! - if (qual >= MIN_BASE_QUAL_TO_COUNT) { - // check which haplotype this read represents and take the index of it from the list of headers - if (haplotypeHeaderMap.containsKey(base)) { - haplotype = haplotypeHeaderMap.get(base); - } - // create new lists if this haplotype has not been seen yet - else { - haplotype = currentHaplotype; - haplotypeHeaderMap.put(base, currentHaplotype); - headersPosStrand.add(new LinkedList()); - headersNegStrand.add(new LinkedList()); - currentHaplotype++; - } - LinkedList header = read.getReadNegativeStrandFlag() ? 
headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype); - // add to the polyploid header + // initialize the mapping from base (allele) to header + final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); + for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false) ) { + final int currentIndex = alleleHeaderMap.size(); + if ( currentIndex > 1 ) + throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); + + alleleHeaderMap.put(allele.b, currentIndex); + headersPosStrand[currentIndex] = new HeaderElementList(); + headersNegStrand[currentIndex] = new HeaderElementList(); + } + + // sanity check that we saw 2 alleles + if ( alleleHeaderMap.size() != 2 ) + throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); + + final ObjectList toRemoveFromReadCache = new ObjectArrayList(); + final ObjectList toRemoveFromHeader = new ObjectArrayList(); + + for ( final GATKSAMRecord read : readsInWindow ) { + + // if the read falls after the region, just skip it for now (we'll get to it later) + if ( read.getSoftStart() > refStop ) + continue; + + // if the read falls before the region, remove it + if ( read.getSoftEnd() < refStart ) { + toRemoveFromReadCache.add(read); + continue; + } + + // check whether the read spans the het site + if ( read.getSoftStart() <= globalHetRefPosition && read.getSoftEnd() >= globalHetRefPosition ) { + + // make sure it meets the minimum mapping quality requirement (if not, we'll remove it and not use it for the consensuses) + if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { + + // where on the read is the het position? 
+ final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); + + // this is safe because indels are not supported + final byte base = read.getReadBases()[readPosOfHet]; + final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; + + // make sure that the base passes filters (if not, we'll remove it and not use it for the consensuses) + if ( qual >= MIN_BASE_QUAL_TO_COUNT ) { + + // check which allele this read represents + final Integer allele = alleleHeaderMap.get(base); + + // ignore the read if it represents a base that's not part of the consensus + if ( allele != null ) { + // add to the appropriate polyploid header + final LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; addToHeader(header, read); - // remove from the standard header so that we don't double count it - removeFromHeader(windowHeader, read); } } } - // we remove all reads before and inside the variant region from the window - toRemove.add(read); + // remove from the standard header so that we don't double count it + toRemoveFromHeader.add(read); } + + // we remove all reads falling inside the variant region from the window + toRemoveFromReadCache.add(read); } - for (LinkedList header : headersPosStrand) { - if (header.size() > 0) - hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false)); - if (runningConsensus != null) - hetReads.add(finalizeRunningConsensus()); + // sanity check that no new "variant region" exists on just a single consensus strand + // due to softclips now that we've broken everything out into their component parts + for ( final LinkedList header : headersPosStrand ) { + if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) + return null; } - for (LinkedList header : headersNegStrand) { - if (header.size() > 0) - hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true)); - if 
(runningConsensus != null) - hetReads.add(finalizeRunningConsensus()); + for ( final LinkedList header : headersNegStrand ) { + if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) + return null; } - removeReadsFromWindow(toRemove); + // create the polyploid synthetic reads + final ObjectList hetReads = new ObjectArrayList(); + for ( final LinkedList header : headersPosStrand ) + finalizeHetConsensus(header, false, hetReads); + for ( final LinkedList header : headersNegStrand ) + finalizeHetConsensus(header, true, hetReads); + + // remove all used reads + for ( final GATKSAMRecord read : toRemoveFromReadCache ) + readsInWindow.remove(read); + for ( final GATKSAMRecord read : toRemoveFromHeader ) + removeFromHeader(windowHeader, read); return hetReads; } + /* + * Finalizes a particular het consensus for the given header representation + * + * @param header the list of header elements representing the header for the consensus + * @param isNegativeStrand does this header represent reads on the negative strand? + * @param result list in which to store results + */ + protected void finalizeHetConsensus(final LinkedList header, final boolean isNegativeStrand, final ObjectList result) { + if ( header.size() > 0 ) + result.addAll(addToSyntheticReads(header, 0, header.size(), isNegativeStrand ? 
SyntheticRead.StrandType.NEGATIVE : SyntheticRead.StrandType.POSITIVE)); + if ( runningConsensus != null ) + result.add(finalizeRunningConsensus()); + } + private void addToHeader(LinkedList header, GATKSAMRecord read) { updateHeaderCounts(header, read, false); } @@ -1101,11 +1194,5 @@ public class SlidingWindow { } } } - - private void removeReadsFromWindow (final ObjectList readsToRemove) { - for (final GATKSAMRecord read : readsToRemove) { - readsInWindow.remove(read); - } - } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index 451e50286..b1ac19f50 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -76,9 +76,17 @@ import java.util.Iterator; * @since 8/26/11 */ public class SyntheticRead { - // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce - // memory footprint. - // TODO: better name + + /** + * The types of strandedness for synthetic reads + */ + public enum StrandType { + POSITIVE, + NEGATIVE, + STRANDLESS + } + + // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint. 
private static class SingleBaseInfo { byte baseIndexOrdinal; // enum BaseIndex.ordinal byte count; @@ -134,7 +142,7 @@ public class SyntheticRead { private String readName; private int refStart; private boolean hasIndelQualities = false; - private boolean isNegativeStrand = false; + private StrandType strandType = StrandType.STRANDLESS; /** * Full initialization of the running consensus if you have all the information and are ready to @@ -147,7 +155,7 @@ public class SyntheticRead { * @param readName the read's name * @param refStart the alignment start (reference based) */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) { final int initialCapacity = 10000; basesCountsQuals = new ObjectArrayList(initialCapacity); mappingQuality = 0.0; @@ -159,10 +167,10 @@ public class SyntheticRead { this.readName = readName; this.refStart = refStart; this.hasIndelQualities = hasIndelQualities; - this.isNegativeStrand = isNegativeRead; + this.strandType = strandType; } - public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(ObjectArrayList bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, 
StrandType strandType) { basesCountsQuals = new ObjectArrayList(bases.size()); for (int i = 0; i < bases.size(); ++i) { basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i))); @@ -175,7 +183,7 @@ public class SyntheticRead { this.readName = readName; this.refStart = refStart; this.hasIndelQualities = hasIndelQualities; - this.isNegativeStrand = isNegativeRead; + this.strandType = strandType; } /** @@ -216,8 +224,11 @@ public class SyntheticRead { read.setReferenceIndex(contigIndex); read.setReadPairedFlag(false); read.setReadUnmappedFlag(false); - read.setReadNegativeStrandFlag(isNegativeStrand); - read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) + if ( strandType != StrandType.STRANDLESS ) { + read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line + read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE); + } + read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) read.setAlignmentStart(refStart); read.setReadName(readName); read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 4259dbdb6..4e13e0d9d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -599,9 +599,9 @@ public class UnifiedGenotyperEngine { int numDeletions = 0; for ( final PileupElement p : rawContext.getBasePileup() ) { if ( p.isDeletion() ) - numDeletions++; + numDeletions += p.getRepresentativeCount(); } - if ( ((double) numDeletions) / ((double) 
rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { return null; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java new file mode 100644 index 000000000..2f8295008 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java @@ -0,0 +1,175 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.HashSet; +import java.util.Set; + +/** + * Emits intervals present in either the original or reduced bam but not the other. + * + *

Input

+ *

+ * The original and reduced BAM files. + *

+ * + *

Output

+ *

+ * A list of intervals present in one bam but not the other. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -I:original original.bam \
+ *   -I:reduced reduced.bam \
+ *   -R ref.fasta \
+ *   -T AssessReducedCoverage \
+ *   -o output.intervals
+ * 
+ * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) +@Hidden +public class AssessReducedCoverage extends LocusWalker implements TreeReducible { + + private static final String original = "original"; + private static final String reduced = "reduced"; + + @Output + protected PrintStream out; + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) + public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; + + public void initialize() {} + + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if ( tracker == null ) + return null; + + final Set tags = getAllTags(context.getBasePileup()); + return (tags.contains(original) && !tags.contains(reduced)) || + (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; + } + + private Set getAllTags(final ReadBackedPileup pileup) { + + final Set tags = new HashSet(10); + + for ( final PileupElement p : pileup ) { + if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) + tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); + } + + return tags; + } + + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index c48c7cdc7..2f744e914 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -53,6 +53,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; 
+import java.util.Arrays; import java.util.List; public class HeaderElementUnitTest extends BaseTest { @@ -136,10 +137,12 @@ public class HeaderElementUnitTest extends BaseTest { private class AllelesTest { public final int[] counts; public final double proportion; + public final boolean allowDeletions; - private AllelesTest(final int[] counts, final double proportion) { + private AllelesTest(final int[] counts, final double proportion, final boolean allowDeletions) { this.counts = counts; this.proportion = proportion; + this.allowDeletions = allowDeletions; } } @@ -150,12 +153,16 @@ public class HeaderElementUnitTest extends BaseTest { final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 }; - for ( final int count1 : counts ) { - for ( final int count2 : counts ) { - for ( final int count3 : counts ) { - for ( final int count4 : counts ) { - for ( final double proportion : proportions ) { - tests.add(new Object[]{new AllelesTest(new int[]{count1, count2, count3, count4}, proportion)}); + for ( final int countA : counts ) { + for ( final int countC : counts ) { + for ( final int countG : counts ) { + for ( final int countT : counts ) { + for ( final int countD : counts ) { + for ( final double proportion : proportions ) { + for ( final boolean allowDeletions : Arrays.asList(true, false) ) { + tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, proportion, allowDeletions)}); + } + } } } } @@ -170,24 +177,27 @@ public class HeaderElementUnitTest extends BaseTest { HeaderElement headerElement = new HeaderElement(1000, 0); for ( int i = 0; i < test.counts.length; i++ ) { - BaseIndex base = BaseIndex.values()[i]; + final BaseIndex base = BaseIndex.values()[i]; for ( int j = 0; j < test.counts[i]; j++ ) headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false); } - final int nAllelesSeen = 
headerElement.getNumberOfAlleles(test.proportion); - final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion); + final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion, test.allowDeletions); + final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion, test.allowDeletions); Assert.assertEquals(nAllelesSeen, nAllelesExpected); } - private static int calculateExpectedAlleles(final int[] counts, final double proportion) { + private static int calculateExpectedAlleles(final int[] counts, final double proportion, final boolean allowDeletions) { double total = 0.0; for ( final int count : counts ) { total += count; } - final int minCount = (int)(proportion * total); + final int minCount = Math.max(1, (int)(proportion * total)); + + if ( !allowDeletions && counts[BaseIndex.D.index] >= minCount ) + return -1; int result = 0; for ( final int count : counts ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 0cbd537ed..de95b5e9a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -47,12 +47,16 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.collections.Pair; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; +import java.util.List; public class ReduceReadsIntegrationTest extends WalkerTest { final static String REF = b37KGReference; + final static String DBSNP = b37dbSNP132; final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; final String 
DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam"; final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam"; @@ -67,48 +71,128 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; - private void RRTest(String testName, String args, String md5) { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s "; - WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); + final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; + + protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { + final Pair, List> result = super.executeTest(name, spec); + + // perform some Reduce Reads specific testing now + if ( result != null ) { + + // generate a new command-line based on the old one + spec.disableImplicitArgs(); + final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" "); + + final StringBuilder newArgs = new StringBuilder(); + for ( int i = 0; i < originalArgs.length; i++ ) { + final String arg = originalArgs[i]; + if ( arg.equals("-T") ) { + newArgs.append("-T AssessReducedCoverage "); + } else if ( arg.startsWith("-I") ) { + newArgs.append("-I:original "); + newArgs.append(originalArgs[++i]); + newArgs.append(" "); + } else if ( arg.equals("-R") || arg.equals("-L") ) { + newArgs.append(arg); + newArgs.append(" "); + newArgs.append(originalArgs[++i]); + newArgs.append(" "); + } + } + for ( final File file : result.getFirst() ) { + newArgs.append("-I:reduced "); + newArgs.append(file.getAbsolutePath()); + newArgs.append(" "); + } + newArgs.append("-o %s"); + + super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(newArgs.toString(), Arrays.asList(emptyFileMd5))); + } + + return result; + } + + protected 
Pair, List> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) { + return super.executeTest(name, spec); + } + + private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " "; + WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); executeTest(testName, spec); } @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "16d97a47b8dbfae4ea64fbdf522b693c"); + RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false); } @Test(enabled = true) - public void testInsertionsAtEdgeOfConsensus() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("f7a9a27c5eaf791b67a768fff960a9e1"))); + public void testDefaultCompressionWithKnowns() { + RRTest("testDefaultCompressionWithKnowns ", L, "79cdbd997196957af63f46353cff710b", true); } + private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; + @Test(enabled = true) public void testMultipleIntervals() { - String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "8886ba383e21883241b386882e8e5063"); + RRTest("testMultipleIntervals ", intervals, "6733b25e87e3fce5753cf7936ccf934f", false); + } + + @Test(enabled = true) + public void testMultipleIntervalsWithKnowns() { + 
RRTest("testMultipleIntervalsWithKnowns ", intervals, "99e2a79befc71eaadb4197c66a0d6df8", true); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "54253f25d363852a1182aff33e500b92"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e3b7e14655973c8950d7fec96321e483", false); + } + + @Test(enabled = true) + public void testHighCompressionWithKnowns() { + RRTest("testHighCompressionWithKnowns ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "30a7ed079b3a41ed63e520260fa6afe3", true); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "1d7d2d28900db57dad65a8beef64b8cb"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false); + } + + @Test(enabled = true) + public void testLowCompressionWithKnowns() { + RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true); } @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f58ae2154e0e5716be0e850b7605856e"); + final String md5 = "f58ae2154e0e5716be0e850b7605856e"; + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); + RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); } @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bfe0693aea74634f1035a9bd11302517"))); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517"))); 
+ } + + @Test(enabled = true) + public void testCoReduction() { + String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022"))); + } + + @Test(enabled = true) + public void testCoReductionWithKnowns() { + String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7"))); + } + + @Test(enabled = true) + public void testInsertionsAtEdgeOfConsensus() { + String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4445db7aeddaf2f1d86e1af0cdc74c8"))); } /** @@ -122,7 +206,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("f118e83c394d21d901a24230379864fc"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("f118e83c394d21d901a24230379864fc"))); } /** @@ -132,26 +216,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); - } - - 
@Test(enabled = true) - public void testCoReduction() { - String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("81312c31b9910a42bff6acb5167592ab"))); + // we expect to lose coverage due to the downsampling so don't run the systematic coverage test + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); } /** - * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will + * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will * break the current hard-clipping routine that protects reduce reads from such reads. */ @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); } - /** * Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same. 
*/ @@ -159,7 +237,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("9bed260b6245f5ff47db8541405504aa"))); + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa"))); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java index b9399bb1b..15b79b78a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java @@ -49,12 +49,29 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; 
+import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Random; @@ -96,7 +113,7 @@ public class ReduceReadsUnitTest extends BaseTest { /** * Test the read name compression functionality */ - @Test(dataProvider = "ReadNameProvider") + @Test(dataProvider = "ReadNameProvider", enabled = false) public void testReadNameCompression(final String name, final boolean alreadySeen) { GATKSAMRecord read = GATKSAMRecord.createRandomRead(1); read.setReadName(name); @@ -108,4 +125,90 @@ public class ReduceReadsUnitTest extends BaseTest { Assert.assertTrue(hash.containsKey(name)); } + + ///////////////////////////////////////////////////////////////////////////// + //// This section tests the functionality related to known SNP positions //// + ///////////////////////////////////////////////////////////////////////////// + + private static SAMFileHeader header; + private static GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 100); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + @DataProvider(name = "PopulateKnownsProvider") + public Object[][] populateKnownsProvider() { + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + final Allele AC = Allele.create("AC"); + + final VariantContext snp_1_10 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, C)).make(); + final VariantContext snp_1_10_2 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, G)).make(); + final 
VariantContext snp_1_20 = new VariantContextBuilder("known", "chr1", 20, 20, Arrays.asList(A, C)).make(); + final VariantContext snp_1_30 = new VariantContextBuilder("known", "chr1", 30, 30, Arrays.asList(A, C)).make(); + final VariantContext snp_2_10 = new VariantContextBuilder("known", "chr2", 10, 10, Arrays.asList(A, C)).make(); + final VariantContext snp_3_10 = new VariantContextBuilder("known", "chr3", 10, 10, Arrays.asList(A, C)).make(); + final VariantContext indel_1_40 = new VariantContextBuilder("known", "chr1", 40, 40, Arrays.asList(A, AC)).make(); + final VariantContext indel_2_40 = new VariantContextBuilder("known", "chr2", 40, 40, Arrays.asList(A, AC)).make(); + + final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "foo1", 0, 1, 1); + final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "foo2", 1, 1, 1); + final GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "foo3", 2, 1, 1); + + final ObjectArrayList tests = new ObjectArrayList(); + + // test single + tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))}); + + // test multiple at one position + tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))}); + + // test multiple + tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); + + // test indel not used + tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))}); + tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))}); + + // test read clears + tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), 
makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); + tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); + tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); + tests.add(new Object[]{4, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); + tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))}); + tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))}); + + return tests.toArray(new Object[][]{}); + } + + private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) { + final List x = new ArrayList(); + x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known")); + final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd())); + return new RefMetaDataTracker(Arrays.asList(rods)); + } + + @Test(dataProvider = "PopulateKnownsProvider") + public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List trackers) { + final ReduceReads rr = new ReduceReads(); + RodBinding.resetNameCounter(); + rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known")); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + rr.setToolkit(engine); + + for ( final 
RefMetaDataTracker tracker : trackers ) + rr.populateKnownSNPs(tracker); + Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear); + + rr.clearStaleKnownPositions(read); + Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear); + } + } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index 054f7aa15..f081b9f8a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -46,9 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import it.unimi.dsi.fastutil.objects.ObjectList; -import it.unimi.dsi.fastutil.objects.ObjectSet; +import it.unimi.dsi.fastutil.objects.*; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; @@ -317,6 +315,7 @@ public class SlidingWindowUnitTest extends BaseTest { private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); + private static final GenomeLoc loc312 = new UnvalidatingGenomeLoc("1", 0, 1000312, 1000312); private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); @DataProvider(name = "ConsensusCreation") @@ -325,10 +324,11 @@ public class SlidingWindowUnitTest extends BaseTest { // test high quality reads and bases tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1)}); - 
tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 5)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 6)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc312), false, false, 11, 8)}); // test low quality reads tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1)}); @@ -349,8 +349,7 @@ public class SlidingWindowUnitTest extends BaseTest { tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 3, 3)}); // test I/D operators - // TODO -- uncomment this test when the deletion bug is fixed! - // tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 5)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11)}); @@ -364,23 +363,66 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "ConsensusCreation", enabled = true) public void testConsensusCreationTest(ConsensusCreationTest test) { - // test WITHOUT het compression allowed - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false); + final ObjectAVLTreeSet knownSNPs = new 
ObjectAVLTreeSet(); + + // test WITHOUT het compression + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(); + Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); - // test WITH het compression allowed - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, true); + // test WITH het compression + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - result = slidingWindow.close(); + for ( int i = 0; i < 1200; i++ ) + knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); + result = slidingWindow.close(knownSNPs); Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); } + @Test + public void testConsensusCreationForMultiallelic() { + + final int totalNumReads = 7; + final ObjectList myReads = new ObjectArrayList(totalNumReads); + + for ( int i = 0; i < totalNumReads; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + + final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 
'C' : 'G'); + read.setReadBases(Utils.dupBytes((byte) base, readLength)); + + myReads.add(read); + } + + final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); + + // test WITHOUT het compression + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty + + Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all + + // test WITH het compression + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + for ( int i = 0; i < readLength; i++ ) + knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); + result = slidingWindow.close(knownSNPs); + + Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all + } + /////////////////////////////////////////////////////////// //// This section tests the downsampling functionality //// @@ -398,7 +440,7 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "Downsampling", enabled = true) public void testDownsamplingTest(final int dcov) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); Assert.assertEquals(result.size(), 
Math.min(dcov, basicReads.size())); @@ -446,10 +488,10 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(dataProvider = "ConsensusQuals", enabled = true) public void testConsensusQualsTest(QualsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(); + final Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); Assert.assertEquals(result.getFirst().size(), 1); final GATKSAMRecord read = result.getFirst().iterator().next(); @@ -515,7 +557,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); read.setMappingQuality(30); - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start); Assert.assertEquals(newIndex, start > 0 ? 
start : 0); @@ -559,7 +601,7 @@ public class SlidingWindowUnitTest extends BaseTest { read.setMappingQuality(30); // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false, false); + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); for ( int i = 0; i < start; i++ ) Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); @@ -573,4 +615,84 @@ public class SlidingWindowUnitTest extends BaseTest { for ( int i = 0; i < currentHeaderLength; i++ ) Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); } + + + ////////////////////////////////////////////////////////////////////////////////// + //// This section tests functionality related to polyploid consensus creation //// + ////////////////////////////////////////////////////////////////////////////////// + + @DataProvider(name = "MatchesKnownProvider") + public Object[][] matchesKnownProvider() { + + final ObjectArrayList tests = new ObjectArrayList(); + + // test no knowns + tests.add(new Object[]{new ObjectAVLTreeSet(), loc290.getStart(), false}); + + final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); + knownSnpPositions.add(loc290); + knownSnpPositions.add(loc295); + knownSnpPositions.add(loc310); + + // test overlap + tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true}); + tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true}); + tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true}); + tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false}); + + return tests.toArray(new Object[][]{}); + } + + 
@Test(dataProvider = "MatchesKnownProvider") + public void testMatchesKnown(final ObjectSortedSet knownSnpPositions, final int targetLoc, final boolean expectedResult) { + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); + Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult); + } + + @DataProvider(name = "SignificantSoftclipsProvider") + public Object[][] SignificantSoftclipsTestData() { + List tests = new ArrayList(); + + for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) { + for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) { + tests.add(new Object[]{indexWithSoftclips, indexToSkip}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SignificantSoftclipsProvider", enabled = true) + public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) { + + // set up the window header + final int currentHeaderStart = 100; + final int currentHeaderLength = 10; + final LinkedList windowHeader = new LinkedList(); + for ( int i = 0; i < currentHeaderLength; i++ ) + windowHeader.add(new HeaderElement(currentHeaderStart + i)); + + // set up the normal read + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength); + read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength)); + read.setMappingQuality(30); + + // add the read + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); + + // set up and add a soft-clipped read if requested + if ( indexWithSoftclips != -1 ) { + final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, 
currentHeaderStart + indexWithSoftclips, 1); + softclippedRead.setReadBases(new byte[]{(byte) 'A'}); + softclippedRead.setBaseQualities(new byte[]{(byte) 30}); + softclippedRead.setMappingQuality(30); + softclippedRead.setCigarString("1S"); + slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips); + } + + final boolean result = slidingWindow.hasSignificantSoftclipPosition(windowHeader, currentHeaderStart + indexToSkip); + Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java index 570b797ca..6886568e8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java @@ -77,7 +77,7 @@ public void testBaseCounts() { new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; for (TestRead testRead : testReads) { - SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false); + SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, 
artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, SyntheticRead.StrandType.STRANDLESS); Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index b5fe79993..0620f15df 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -69,18 +69,18 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "866c19ba60862ad1569d88784423ec8c"); + testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "3e01f990c7a7c25fd9e42be559ca2942"); + testReducedCalling("INDEL", "9a702e7a85465f6c42d6c1828aee6c38"); } private void testReducedCalling(final String model, final String md5) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, Arrays.asList(md5)); executeTest("test calling on a ReducedRead BAM with " + model, spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c5f9f606b..01f39a67b 
100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -52,6 +52,7 @@ import java.util.*; public class GATKSAMRecord extends BAMRecord { // ReduceReads specific attribute tags public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool + public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end @@ -74,7 +75,7 @@ public class GATKSAMRecord extends BAMRecord { private int softEnd = UNINITIALIZED; private Integer adapterBoundary = null; - private boolean isStrandlessRead = false; + private Boolean isStrandlessRead = null; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -158,6 +159,9 @@ public class GATKSAMRecord extends BAMRecord { * @return true if this read doesn't have meaningful strand information */ public boolean isStrandless() { + if ( isStrandlessRead == null ) { + isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null; + } return isStrandlessRead; } @@ -175,7 +179,7 @@ public class GATKSAMRecord extends BAMRecord { } @Override - public void setReadNegativeStrandFlag(boolean flag) { + public void setReadNegativeStrandFlag(final boolean flag) { if ( isStrandless() ) throw new IllegalStateException("Cannot set the strand of a strandless read"); super.setReadNegativeStrandFlag(flag); diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java index 6472a10bb..c07bf171a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java @@ -72,7 +72,7 @@ public class CallableLociIntegrationTest extends WalkerTest { public void testWithReducedRead() { String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, - Arrays.asList("684069ffe94a1175051066ed53f0fd9d", "ebc310cf734d98e26d2d83e16b1144d1")); + Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0")); executeTest("CallableLoci with ReducedRead", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 38840fab1..18a501b51 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -64,7 +64,6 @@ public class GATKSAMRecordUnitTest extends BaseTest { for (int i = 0; i < reducedRead.getReadLength(); i++) { Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); } - Assert.assertEquals(reducedRead.isStrandless(), false, "Reduced reads don't have meaningful strandedness information"); } @Test