diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index ed5802d38..0e434b4af 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -220,4 +220,8 @@ import java.util.Map; return 0.0; return (double) counts.get(index) / totalCountWithoutIndels(); } + + public Object[] countsArray() { + return (Object []) counts.values().toArray(); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 6b92046de..3fc438b19 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Arrays; import java.util.LinkedList; /** @@ -200,5 +201,28 @@ public class HeaderElement { return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; } + /** + * Calculates the number of haplotypes necessary to represent this site. + * + * @param minVariantProportion the minimum proportion to call a site variant. + * @return the number of haplotypes necessary to represent this site. 
+ */ + public int getNumberOfHaplotypes(double minVariantProportion) { + int nHaplotypes = 0; + int totalCount = consensusBaseCounts.totalCount(); + int runningCount = 0; + if (totalCount == 0) + return 0; + + Object[] countsArray = consensusBaseCounts.countsArray(); + Arrays.sort(countsArray); + for (int i = countsArray.length-1; i>=0; i--) { + nHaplotypes++; + runningCount += (Integer) countsArray[i]; + if ((double) (totalCount - runningCount) / totalCount < minVariantProportion) + break; + } + return nHaplotypes; + } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 44971ca38..9b2f0bc12 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -53,11 +53,12 @@ public class MultiSampleCompressor implements Compressor { final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { + final ReduceReads.DownsampleStrategy downsampleStrategy, + final int nContigs) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, - new SingleSampleCompressor(name, contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + new SingleSampleCompressor(contextSize, downsampleCoverage, + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index d1ec9c474..0def4e582 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -52,23 +52,23 @@ import java.util.*; /** * Reduces the BAM file using read based compression that keeps only essential information for variant calling - *
+ * ** This walker will generated reduced versions of the BAM files that still follow the BAM spec * and contain all the information necessary for the GSA variant calling pipeline. Some options * allow you to tune in how much compression you want to achieve. The default values have been * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the * savings in file size and performance of the downstream tools. - *
+ * ** The BAM file to be compressed *
- * + * ** The compressed (reduced) BAM file. - *
+ * * *@@ -86,13 +86,13 @@ import java.util.*; public class ReduceReads extends ReadWalker, ReduceReadsStash> { @Output - protected StingSAMFileWriter out; + private StingSAMFileWriter out; /** * The number of bases to keep around mismatches (potential variation) */ @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) - protected int contextSize = 10; + private int contextSize = 10; /** * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have @@ -100,7 +100,7 @@ public class ReduceReads extends ReadWalker , ReduceRea * towards variable regions. */ @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) - protected int minMappingQuality = 20; + private int minMappingQuality = 20; /** * The minimum base quality to be considered for the consensus synthetic read. Reads that have @@ -108,35 +108,35 @@ public class ReduceReads extends ReadWalker , ReduceRea * towards variable regions. */ @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) - protected byte minBaseQual = 20; + private byte minBaseQual = 20; /** * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality * lower than this threshold will be hard clipped off before entering the reduce reads algorithm. */ @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) - protected byte minTailQuality = 2; + private byte minTailQuality = 2; /** * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals * and read group). */ @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) - protected boolean DONT_SIMPLIFY_READS = false; + private boolean DONT_SIMPLIFY_READS = false; /** * Do not hard clip adaptor sequences. 
Note: You don't have to turn this on for reads that are not mate paired. * The program will behave correctly in those cases. */ @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) - protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; + private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; /** * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail * quality. */ @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) - protected boolean DONT_CLIP_LOW_QUAL_TAILS = false; + private boolean DONT_CLIP_LOW_QUAL_TAILS = false; /** * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped @@ -144,7 +144,7 @@ public class ReduceReads extends ReadWalker , ReduceRea * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) */ @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) - protected boolean DONT_USE_SOFTCLIPPED_BASES = false; + private boolean DONT_USE_SOFTCLIPPED_BASES = false; /** * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee @@ -152,47 +152,56 @@ public class ReduceReads extends ReadWalker , ReduceRea * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. */ @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) - protected boolean DONT_COMPRESS_READ_NAMES = false; + private boolean DONT_COMPRESS_READ_NAMES = false; /** * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval * border. 
 */ @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false) - protected boolean HARD_CLIP_TO_INTERVAL = false; + private boolean HARD_CLIP_TO_INTERVAL = false; /** * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be * considered consensus. */ @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) - protected double minAltProportionToTriggerVariant = 0.05; + private double minAltProportionToTriggerVariant = 0.05; /** * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be * considered consensus. */ @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) - protected double minIndelProportionToTriggerVariant = 0.05; + private double minIndelProportionToTriggerVariant = 0.05; + + /** + * Maximum number of haplotypes (contigs) allowed when compressing a variant region into a polyploid + * consensus. Variant regions that would need more haplotypes than this are left uncompressed. + */ + @Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false) + private int nContigs = 2; + + + /** * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). * A value of 0 turns downsampling off. 
*/ @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) - protected int downsampleCoverage = 250; + private int downsampleCoverage = 250; @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) - protected int debugLevel = 0; + private int debugLevel = 0; @Hidden @Argument(fullName = "", shortName = "dr", doc = "", required = false) - protected String debugRead = ""; + private String debugRead = ""; @Hidden @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) - protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; + private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; @Hidden @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) @@ -203,7 +212,6 @@ public class ReduceReads extends ReadWalker , ReduceRea Adaptive } - protected int totalReads = 0; int nCompressedReads = 0; HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). 
@@ -249,7 +257,6 @@ public class ReduceReads extends ReadWalker , ReduceRea @Override public LinkedList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { LinkedList mappedReads; - totalReads++; if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) System.out.println("Found debug read!"); @@ -316,7 +323,7 @@ public class ReduceReads extends ReadWalker , ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs)); } /** @@ -532,8 +539,6 @@ public class ReduceReads extends ReadWalker , ReduceRea read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start) if (endShift > 0) read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end) - - totalReads++; } if (debugLevel == 1) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 6d2c2d215..f1a7b248f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -8,35 +7,31 @@ import java.util.TreeSet; /** * - * @author depristo - * @version 0.1 + * @author carneiro, depristo + * @version 3.0 */ public class SingleSampleCompressor implements Compressor { - protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class); + final private int contextSize; + final private int downsampleCoverage; + final private int minMappingQuality; + final private double minAltProportionToTriggerVariant; + final private double minIndelProportionToTriggerVariant; + final private int minBaseQual; + final private ReduceReads.DownsampleStrategy downsampleStrategy; + final private int nContigs; - protected final int contextSize; - protected final int downsampleCoverage; - protected int minMappingQuality; - protected int slidingWindowCounter; + private SlidingWindow slidingWindow; + private int slidingWindowCounter; - protected final String sampleName; - protected SlidingWindow slidingWindow; - protected double minAltProportionToTriggerVariant; - protected double minIndelProportionToTriggerVariant; - protected int minBaseQual; - - protected ReduceReads.DownsampleStrategy downsampleStrategy; - - public SingleSampleCompressor(final String sampleName, - final int contextSize, + public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, final int minMappingQuality, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { - this.sampleName = sampleName; + final ReduceReads.DownsampleStrategy downsampleStrategy, + final int 
nContigs) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -45,6 +40,7 @@ public class SingleSampleCompressor implements Compressor { this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minBaseQual = minBaseQual; this.downsampleStrategy = downsampleStrategy; + this.nContigs = nContigs; } /** @@ -66,7 +62,7 @@ public class SingleSampleCompressor implements Compressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 2e67b91bb..326ae965a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -8,14 +8,12 @@ import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import 
org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.ListIterator; +import java.util.*; /** * Created by IntelliJ IDEA. @@ -56,6 +54,8 @@ public class SlidingWindow { protected ReduceReads.DownsampleStrategy downsampleStrategy; private boolean hasIndelQualities; + private final int nContigs; + /** * The types of synthetic reads to use in the finalizeAndAdd method */ @@ -77,12 +77,12 @@ public class SlidingWindow { return contigIndex; } - public int getStartLocation() { - return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation(); + public int getStartLocation(LinkedList header) { + return header.isEmpty() ? -1 : header.peek().getLocation(); } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs) { this.stopLocation = -1; this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -111,6 +111,7 @@ public class SlidingWindow { this.downsampleStrategy = downsampleStrategy; this.hasIndelQualities = hasIndelQualities; 
+ this.nContigs = nContigs; } /** @@ -125,7 +126,7 @@ public class SlidingWindow { * @return a list of reads that have been finished by sliding the window. */ public List addRead(GATKSAMRecord read) { - updateHeaderCounts(read, false); // update the window header counts + addToHeader(windowHeader, read); // update the window header counts readsInWindow.add(read); // add read to sliding reads return slideWindow(read.getUnclippedStart()); } @@ -191,9 +192,9 @@ public class SlidingWindow { protected List slideWindow(int incomingReadUnclippedStart) { List finalizedReads = new LinkedList (); - if (incomingReadUnclippedStart - contextSize > getStartLocation()) { - int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(); - boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex); + if (incomingReadUnclippedStart - contextSize > getStartLocation(windowHeader)) { + int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(windowHeader); + boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex); int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) List > regions = getAllVariantRegions(0, breakpoint, variantSite); @@ -201,7 +202,7 @@ public class SlidingWindow { List readsToRemove = new LinkedList (); for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) 
- if (read.getAlignmentEnd() < getStartLocation()) { + if (read.getAlignmentEnd() < getStartLocation(windowHeader)) { readsToRemove.add(read); } } @@ -222,15 +223,15 @@ public class SlidingWindow { */ protected boolean[] markSites(int stop) { - boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1]; + boolean[] markedSites = new boolean[stop - getStartLocation(windowHeader) + contextSize + 1]; Iterator headerElementIterator = windowHeader.iterator(); - for (int i = getStartLocation(); i < stop; i++) { + for (int i = getStartLocation(windowHeader); i < stop; i++) { if (headerElementIterator.hasNext()) { HeaderElement headerElement = headerElementIterator.next(); if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) - markVariantRegion(markedSites, i - getStartLocation()); + markVariantRegion(markedSites, i - getStartLocation(windowHeader)); } else break; @@ -260,14 +261,13 @@ public class SlidingWindow { * @param end the first header index NOT TO add to consensus * @return a list of consensus reads generated by this call. Empty list if no consensus was generated. 
*/ - protected List addToSyntheticReads(int start, int end) { + protected List addToSyntheticReads(List header, int start, int end) { LinkedList reads = new LinkedList (); if (start < end) { - - ListIterator headerElementIterator = windowHeader.listIterator(start); + ListIterator headerElementIterator = header.listIterator(start); if (!headerElementIterator.hasNext()) - throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end)); + throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end)); HeaderElement headerElement = headerElementIterator.next(); @@ -280,7 +280,7 @@ public class SlidingWindow { if (endOfConsensus <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); - reads.addAll(addToSyntheticReads(endOfConsensus, end)); + reads.addAll(addToSyntheticReads(header, endOfConsensus, end)); } else if (headerElement.hasFilteredData()) { reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); @@ -290,7 +290,7 @@ public class SlidingWindow { if (endOfFilteredData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); - reads.addAll(addToSyntheticReads(endOfFilteredData, end)); + reads.addAll(addToSyntheticReads(header, endOfFilteredData, end)); } else if (headerElement.isEmpty()) { reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); @@ -299,7 +299,7 @@ public class SlidingWindow { if (endOfEmptyData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); - reads.addAll(addToSyntheticReads(endOfEmptyData, end)); + reads.addAll(addToSyntheticReads(header, endOfEmptyData, end)); } else throw new 
ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start)); @@ -474,6 +474,55 @@ public class SlidingWindow { syntheticRead.add(base, count, qual, insQual, delQual, rms); } + private List compressVariantRegion(int start, int stop) { + List allReads = new LinkedList (); + + // Try to compress into a polyploid consensus + int nHaplotypes = 0; + int hetRefPosition = -1; + boolean canCompress = true; + boolean foundEvent = false; + Object[] header = windowHeader.toArray(); + for (int i = start; i<=stop; i++) { + int siteHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); nHaplotypes = Math.max(nHaplotypes, siteHaplotypes); + if (nHaplotypes > nContigs) { + canCompress = false; + break; + } + + // guarantees that there is only 1 site in the variant region that needs more than one haplotype (per-site count, not the running max) + if (siteHaplotypes > 1) { + if (!foundEvent) { + foundEvent = true; + hetRefPosition = ((HeaderElement) header[i]).getLocation(); // reference coordinate -- createPolyploidConsensus compares it against read coordinates + } + else { + canCompress = false; + break; + } + } + } + + int refStart = windowHeader.get(start).getLocation(); + int refStop = windowHeader.get(stop).getLocation(); + + // Try to compress the variant region + if (canCompress) { + allReads = createPolyploidConsensus(start, stop, nHaplotypes, hetRefPosition); + } + + // Return all reads that overlap the variant region and remove each read from the window header entirely + else { + for (GATKSAMRecord read : readsInWindow) { + if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) { + allReads.add(read); + removeFromHeader(windowHeader, read); + } + } + } + return allReads; + } + /** * Finalizes a variant region, any adjacent synthetic reads. 
* @@ -483,20 +532,10 @@ public class SlidingWindow { */ @Requires("start <= stop") protected List closeVariantRegion(int start, int stop) { - List allReads = new LinkedList (); - - int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based - int refStop = windowHeader.get(stop).getLocation(); - - for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region - if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) { - allReads.add(read); - updateHeaderCounts(read, true); // Remove this read from the window header entirely - } - } + List allReads = compressVariantRegion(start, stop); List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; - result.addAll(addToSyntheticReads(0, start)); + result.addAll(addToSyntheticReads(windowHeader, 0, start)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); for (GATKSAMRecord read : allReads) { @@ -566,7 +605,7 @@ public class SlidingWindow { finalizedReads = closeVariantRegions(regions, true); if (!windowHeader.isEmpty()) { - finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1)); + finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size() - 1)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } @@ -611,13 +650,75 @@ public class SlidingWindow { } + + private List createPolyploidConsensus(int start, int stop, int nHaplotypes, int hetRefPosition) { + // we will create two (positive strand, negative strand) headers for each contig + List > headersPosStrand = new ArrayList >(); + List > headersNegStrand = new ArrayList >(); + Map haplotypeHeaderMap = new HashMap (nHaplotypes); + int currentHaplotype = 0; + int refStart = windowHeader.get(start).getLocation(); + int refStop = windowHeader.get(stop).getLocation(); + + for (GATKSAMRecord read : readsInWindow) { + int haplotype = -1; + + // check if the read 
 is inside the variant region + if ( read.getMappingQuality() > MIN_MAPPING_QUALITY && (read.getSoftStart() <= refStop && read.getSoftEnd() >= refStart)) { + + // check if the read contains the het site + if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) { + int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); + byte base = read.getReadBases()[readPos]; + byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos]; + + // check if base passes the filters! reads whose base fails are skipped entirely (haplotype would stay -1) + if (qual > MIN_BASE_QUAL_TO_COUNT) { + // check which haplotype this read represents and take the index of it from the list of headers + if (haplotypeHeaderMap.containsKey(base)) { + haplotype = haplotypeHeaderMap.get(base); + } + // create new lists if this haplotype has not been seen yet (indices are 0-based list positions) + else { + haplotype = currentHaplotype++; + haplotypeHeaderMap.put(base, haplotype); + headersPosStrand.add(new LinkedList ()); + headersNegStrand.add(new LinkedList ()); + } + LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype); + addToHeader(header, read); + } + } + } + } + + List hetReads = new LinkedList (); + List > allHeaders = new LinkedList >(); + allHeaders.addAll(headersPosStrand); + allHeaders.addAll(headersNegStrand); // both strands contribute consensus reads + for (List header : allHeaders) { + hetReads.addAll(addToSyntheticReads(header, 0, header.size())); + hetReads.add(finalizeRunningConsensus()); + } + return hetReads; + } + + + private void addToHeader(LinkedList header, GATKSAMRecord read) { + updateHeaderCounts(header, read, false); + } + + private void removeFromHeader(LinkedList header, GATKSAMRecord read) { + updateHeaderCounts(header, read, true); + } + + + /** + * Updates the sliding window's header counts with the incoming read bases, insertions * and deletions. 
* + * @param header the sliding window header to use * @param read the incoming read to be added to the sliding window + * @param removeRead if we are removing the read from the header or adding */ - protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) { + private void updateHeaderCounts(LinkedList header, GATKSAMRecord read, boolean removeRead) { byte[] bases = read.getReadBases(); byte[] quals = read.getBaseQualities(); byte[] insQuals = read.getExistingBaseInsertionQualities(); @@ -627,7 +728,7 @@ public class SlidingWindow { Cigar cigar = read.getCigar(); int readBaseIndex = 0; - int startLocation = getStartLocation(); + int startLocation = getStartLocation(header); int locationIndex = startLocation < 0 ? 0 : readStart - startLocation; if (removeRead && locationIndex < 0) @@ -636,7 +737,7 @@ public class SlidingWindow { if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window for (int i = 1; i <= -locationIndex; i++) - windowHeader.addFirst(new HeaderElement(startLocation - i)); + header.addFirst(new HeaderElement(startLocation - i)); startLocation = readStart; // update start location accordingly locationIndex = 0; @@ -645,19 +746,19 @@ public class SlidingWindow { if (stopLocation < readEnd) { // Do we need to add extra elements to the header? int elementsToAdd = (stopLocation < 0) ? 
readEnd - readStart + 1 : readEnd - stopLocation; while (elementsToAdd-- > 0) - windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd)); + header.addLast(new HeaderElement(readEnd - elementsToAdd)); stopLocation = readEnd; // update stopLocation accordingly } // Special case for leading insertions before the beginning of the sliding read if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) { - windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added + header.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing. } } - Iterator headerElementIterator = windowHeader.listIterator(locationIndex); + Iterator headerElementIterator = header.listIterator(locationIndex); HeaderElement headerElement; for (CigarElement cigarElement : cigar.getCigarElements()) { switch (cigarElement.getOperator()) { @@ -668,7 +769,7 @@ public class SlidingWindow { break; } - headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element) + headerElement = header.get(locationIndex - 1); // insertions are added to the base to the left (previous element) if (removeRead) { headerElement.removeInsertionToTheRight();