(new AlignmentStartWithNoTiesComparator());
+ for ( SingleSampleCompressor comp : compressorsPerSample.values() )
+ for ( GATKSAMRecord read : comp.close() )
+ reads.add(read);
+ return reads;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
new file mode 100644
index 000000000..d793ae159
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.util.SequenceUtil;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Hidden;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.filters.*;
+import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.PartitionBy;
+import org.broadinstitute.sting.gatk.walkers.PartitionType;
+import org.broadinstitute.sting.gatk.walkers.ReadFilters;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocComparator;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.clipping.ReadClipper;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.util.*;
+
+/**
+ * Reduces the BAM file using read based compression that keeps only essential information for variant calling
+ *
+ *
+ * This walker will generate reduced versions of the BAM files that still follow the BAM spec
+ * and contain all the information necessary for the GSA variant calling pipeline. Some options
+ * allow you to tune in how much compression you want to achieve. The default values have been
+ * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
+ * savings in file size and performance of the downstream tools.
+ *
+ * Input
+ *
+ * The BAM file to be compressed
+ *
+ *
+ * Output
+ *
+ * The compressed (reduced) BAM file.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T ReduceReads \
+ * -I myData.bam \
+ * -o myData.reduced.bam
+ *
+ */
+
+@PartitionBy(PartitionType.INTERVAL)
+@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
+public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
+
+ @Output
+ protected StingSAMFileWriter out;
+
+ /**
+ * The number of bases to keep around mismatches (potential variation)
+ */
+ @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
+ protected int contextSize = 10;
+
+ /**
+ * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
+ * mapping quality below this threshold will not be counted towards consensus, but are still counted
+ * towards variable regions.
+ */
+ @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
+ protected int minMappingQuality = 20;
+
+ /**
+ * The minimum base quality to be considered for the consensus synthetic read. Reads that have
+ * base quality below this threshold will not be counted towards consensus, but are still counted
+ * towards variable regions.
+ */
+ @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
+ protected byte minBaseQual = 20;
+
+ /**
+ * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
+ * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
+ */
+ @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
+ protected byte minTailQuality = 2;
+
+ /**
+ * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
+ * and read group).
+ */
+ @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
+ protected boolean DONT_SIMPLIFY_READS = false;
+
+ /**
+ * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
+ * The program will behave correctly in those cases.
+ */
+ @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
+ protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
+
+ /**
+ * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
+ * quality.
+ */
+ @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
+ protected boolean DONT_CLIP_LOW_QUAL_TAILS = false;
+
+ /**
+ * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
+ * base left by the aligner and use the high quality soft clipped bases in its traversal algorithm to identify variant
+ * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
+ */
+ @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
+ protected boolean DONT_USE_SOFTCLIPPED_BASES = false;
+
+ /**
+ * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
+ * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather
+ * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
+ */
+ @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
+ protected boolean DONT_COMPRESS_READ_NAMES = false;
+
+ /**
+ * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
+ * border.
+ */
+ @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
+ protected boolean HARD_CLIP_TO_INTERVAL = false;
+
+ /**
+ * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
+ * considered consensus.
+ */
+ @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
+ protected double minAltProportionToTriggerVariant = 0.05;
+
+ /**
+ * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
+ * considered consensus.
+ */
+ @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
+ protected double minIndelProportionToTriggerVariant = 0.05;
+
+ /**
+ * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
+ * A value of 0 turns downsampling off.
+ */
+ @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
+ protected int downsampleCoverage = 0;
+
+ @Hidden
+ @Argument(fullName = "", shortName = "dl", doc = "", required = false)
+ protected int debugLevel = 0;
+
+ @Hidden
+ @Argument(fullName = "", shortName = "dr", doc = "", required = false)
+ protected String debugRead = "";
+
+ @Hidden
+ @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
+ protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
+
+ @Hidden
+ @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
+ private boolean NO_PG_TAG = false;
+
+ public enum DownsampleStrategy {
+ Normal,
+ Adaptive
+ }
+
+ protected int totalReads = 0;
+ int nCompressedReads = 0;
+
+ HashMap readNameHash; // This hash maps the name of each original read to its new compressed name (a number).
+ Long nextReadNumber = 1L; // The next number to use for the compressed read name.
+
+ SortedSet intervalList;
+
+ private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
+
+ /**
+ * Basic generic initialization of the readNameHash and the intervalList. Output initialization
+ * is done at the reduceInit method
+ */
+ @Override
+ public void initialize() {
+ super.initialize();
+ GenomeAnalysisEngine toolkit = getToolkit();
+ readNameHash = new HashMap(); // prepare the read name hash to keep track of what reads have had their read names compressed
+ intervalList = new TreeSet(new GenomeLocComparator()); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
+
+ if (toolkit.getIntervals() != null)
+ intervalList.addAll(toolkit.getIntervals());
+
+ if (!NO_PG_TAG)
+ Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);
+ else
+ out.setPresorted(false);
+ }
+
+ /**
+ * Takes in a read and prepares it for the SlidingWindow machinery by performing the
+ * following optional clipping operations:
+ * 1. Hard clip adaptor sequences
+ * 2. Hard clip low quality tails
+ * 3. Hard clip all remaining soft clipped bases
+ * 4. Hard clip read to the intervals in the interval list (this step may produce multiple reads)
+ *
+ * @param ref default map parameter
+ * @param read default map parameter
+ * @param metaDataTracker default map parameter
+ * @return a linked list with all the reads produced by the clipping operations
+ */
+ @Override
+ public LinkedList map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
+ LinkedList mappedReads;
+ totalReads++;
+ if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
+ System.out.println("Found debug read!");
+
+ if (debugLevel == 1)
+ System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
+
+ // we write the actual alignment starts to their respective alignment shift tags in the temporary
+ // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
+ read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
+ read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
+
+ if (!DONT_SIMPLIFY_READS)
+ read.simplify(); // Clear all unnecessary attributes
+ if (!DONT_CLIP_ADAPTOR_SEQUENCES)
+ read = ReadClipper.hardClipAdaptorSequence(read); // Strip away adaptor sequences, if any.
+ if (!DONT_CLIP_LOW_QUAL_TAILS)
+ read = ReadClipper.hardClipLowQualEnds(read, minTailQuality); // Clip low quality tails
+ if (!isWholeGenome()) {
+ if (HARD_CLIP_TO_INTERVAL)
+ mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval
+ else {
+ mappedReads = new LinkedList();
+ mappedReads.add(read);
+ }
+ }
+ else {
+ mappedReads = new LinkedList();
+ if (!read.isEmpty())
+ mappedReads.add(read);
+ }
+
+ if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
+ LinkedList tempList = new LinkedList();
+ for (GATKSAMRecord mRead : mappedReads) {
+ GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
+ if (!clippedRead.isEmpty())
+ tempList.add(clippedRead);
+ }
+ mappedReads = tempList;
+ }
+
+ if (debugLevel == 1)
+ for (GATKSAMRecord mappedRead : mappedReads)
+ System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
+
+ return mappedReads;
+
+ }
+
+ /**
+ * Initializes the ReduceReadsStash that keeps track of all reads that are waiting to
+ * enter the SlidingWindow machinery. The stash makes sure reads are served in order
+ * even though map() may generate reads that are only supposed to enter the machinery
+ * in the future.
+ *
+ * @return the empty stash
+ */
+ @Override
+ public ReduceReadsStash reduceInit() {
+ return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
+ }
+
+ /**
+ * Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process
+ * all reads that come before the original read (the read that was passed to map) including the original
+ * read. This is where we send reads, in order, to the SlidingWindow machinery.
+ *
+ * @param mappedReads the list of reads sent by map
+ * @param stash the stash that keeps the reads in order for processing
+ * @return the stash with all reads that have not been processed yet
+ */
+ public ReduceReadsStash reduce(LinkedList mappedReads, ReduceReadsStash stash) {
+ if (debugLevel == 1)
+ stash.print();
+
+ boolean firstRead = true;
+ for (GATKSAMRecord read : mappedReads) {
+ boolean originalRead = firstRead && isOriginalRead(mappedReads, read);
+
+ if (read.getReadLength() == 0)
+ throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
+
+ if (originalRead) {
+ List readsReady = new LinkedList();
+ readsReady.addAll(stash.getAllReadsBefore(read));
+ readsReady.add(read);
+
+ for (GATKSAMRecord readReady : readsReady) {
+ if (debugLevel == 1)
+ System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
+
+ for (GATKSAMRecord compressedRead : stash.compress(readReady))
+ outputRead(compressedRead);
+
+ }
+ } else
+ stash.add(read);
+
+ firstRead = false;
+ }
+
+ return stash;
+ }
+
+ /**
+ * Now that no more reads will come, we process all the remaining reads in the stash, in order.
+ *
+ * @param stash the ReduceReadsStash with all unprocessed reads (from reduce)
+ */
+ @Override
+ public void onTraversalDone(ReduceReadsStash stash) {
+
+ // output any remaining reads in the compressor
+ for (GATKSAMRecord read : stash.close())
+ outputRead(read);
+ }
+
+ /**
+ * Hard clips away all parts of the read that don't agree with the intervals selected.
+ *
+ * Note: If read overlaps more than one interval, it will be hard clipped to all
+ * the intervals it overlaps with
+ *
+ * @param read the read to be hard clipped to the interval.
+ * @return a shallow copy of the read hard clipped to the interval
+ */
+ private LinkedList hardClipReadToInterval(GATKSAMRecord read) {
+ LinkedList clippedReads = new LinkedList();
+
+ GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)
+
+ boolean originalRead = true; // false if this is the right tail of the original read
+ boolean overlap; // keeps track of the interval that overlapped the original read
+ boolean doneClipping; // triggers an early exit if we are done clipping this read
+
+ if (isWholeGenome())
+ clippedReads.add(read); // if we don't have intervals (wgs) the read goes in unchanged
+
+ for (GenomeLoc interval : intervalList) {
+
+ if (read.isEmpty()) // nothing to do with an empty read (could have been fully clipped before)
+ break;
+
+ GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch
+
+ switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) {
+ case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read
+ if (!originalRead) // something went wrong if this is the tail of the read
+ throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
+ overlap = false;
+ doneClipping = false;
+ break;
+
+
+ case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore
+ if (originalRead) {
+ overlap = true; // effectively, we have found the read's location and now we are going to try and match its tail (which happens to be the entire read).
+ clippedRead = GATKSAMRecord.emptyRead(read);
+ } else
+ overlap = false;
+
+ doneClipping = false;
+ break;
+
+ case NO_OVERLAP_CONTIG: // read is in a different contig
+ if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one.
+ if (read.getReferenceIndex() < interval.getContigIndex())
+ throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
+ else {
+ overlap = false;
+ doneClipping = false;
+ }
+ } // tail read CANNOT be in a different contig.
+ else {
+ if (read.getReferenceIndex() < interval.getContigIndex()) {
+ overlap = false;
+ doneClipping = true;
+ } else
+ throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
+
+ }
+ break;
+
+ case NO_OVERLAP_LEFT:
+ if (originalRead) // if this is the first read this should never happen.
+ throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
+
+ overlap = false;
+ doneClipping = true;
+ break;
+
+ case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore
+ overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here.
+ doneClipping = true;
+ break;
+
+ case OVERLAP_LEFT: // clip the left tail of the read
+ clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1);
+
+ overlap = true;
+ doneClipping = true;
+ break;
+
+ case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval
+ clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1);
+ read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());
+
+ overlap = true;
+ doneClipping = false;
+ break;
+
+ case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read
+ clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1);
+ read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());
+
+ overlap = true;
+ doneClipping = false;
+ break;
+
+ case OVERLAP_CONTAINED: // don't do anything to the read
+ clippedRead = read;
+
+ overlap = true;
+ doneClipping = true;
+ break;
+
+ default:
+ throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval.");
+ }
+
+ if (overlap && originalRead)
+ intervalOverlapped = interval;
+
+ if (clippedRead != null) {
+ originalRead = false;
+
+ if (!clippedRead.isEmpty())
+ clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off
+ }
+
+ if (doneClipping)
+ break;
+ }
+
+ if (intervalOverlapped != null)
+ intervalList = intervalList.tailSet(intervalOverlapped);
+
+ return clippedReads;
+ }
+
+ /**
+ * Compresses the read name and adds it to output BAM file (reduced BAM)
+ * after performing some quality control
+ *
+ * @param read any read
+ */
+ private void outputRead(GATKSAMRecord read) {
+ if (debugLevel == 2) {
+ checkForHighMismatch(read);
+ checkCigar(read);
+ }
+
+ if (read.isReducedRead())
+ nCompressedReads++;
+ else {
+ int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT);
+ int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT);
+
+ int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression
+ int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression
+
+ if (startShift > 0)
+ read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate its original alignment (start)
+ if (endShift > 0)
+ read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate its original alignment (end)
+
+ totalReads++;
+ }
+
+ if (debugLevel == 1)
+ System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());
+
+// if (!DONT_USE_SOFTCLIPPED_BASES)
+// reSoftClipBases(read);
+
+ if (!DONT_COMPRESS_READ_NAMES)
+ compressReadName(read);
+
+ out.addAlignment(read);
+ }
+
+ private void reSoftClipBases(GATKSAMRecord read) {
+ Integer left = (Integer) read.getTemporaryAttribute("SL");
+ Integer right = (Integer) read.getTemporaryAttribute("SR");
+ if (left != null || right != null) {
+ Cigar newCigar = new Cigar();
+ for (CigarElement element : read.getCigar().getCigarElements()) {
+ newCigar.add(new CigarElement(element.getLength(), element.getOperator()));
+ }
+
+ if (left != null) {
+ newCigar = updateFirstSoftClipCigarElement(left, newCigar);
+ read.setAlignmentStart(read.getAlignmentStart() + left);
+ }
+
+ if (right != null) {
+ Cigar invertedCigar = invertCigar(newCigar);
+ newCigar = invertCigar(updateFirstSoftClipCigarElement(right, invertedCigar));
+ }
+ read.setCigar(newCigar);
+ }
+ }
+
+ /**
+ * Facility routine to revert the first element of a Cigar string (skipping hard clips) into a soft-clip.
+ * To be used on both ends if provided a flipped Cigar
+ *
+ * @param softClipSize the length of the soft clipped element to add
+ * @param originalCigar the original Cigar string
+ * @return a new Cigar object with the soft clips added
+ */
+ private Cigar updateFirstSoftClipCigarElement (int softClipSize, Cigar originalCigar) {
+ Cigar result = new Cigar();
+ CigarElement leftElement = new CigarElement(softClipSize, CigarOperator.S);
+ boolean updated = false;
+ for (CigarElement element : originalCigar.getCigarElements()) {
+ if (!updated && element.getOperator() == CigarOperator.M) {
+ result.add(leftElement);
+ int newLength = element.getLength() - softClipSize;
+ if (newLength > 0)
+ result.add(new CigarElement(newLength, CigarOperator.M));
+ updated = true;
+ }
+ else
+ result.add(element);
+ }
+ return result;
+ }
+
+ /**
+ * Given a cigar string, returns the inverted cigar string.
+ *
+ * @param cigar the original cigar
+ * @return the inverted cigar
+ */
+ private Cigar invertCigar(Cigar cigar) {
+ Stack stack = new Stack();
+ for (CigarElement e : cigar.getCigarElements())
+ stack.push(e);
+ Cigar inverted = new Cigar();
+ while (!stack.empty()) {
+ inverted.add(stack.pop());
+ }
+ return inverted;
+ }
+
+
+ /**
+ * Quality control procedure that checks if the consensus reads contains too many
+ * mismatches with the reference. This should never happen and is a good trigger for
+ * errors with the algorithm.
+ *
+ * @param read any read
+ */
+ private void checkForHighMismatch(GATKSAMRecord read) {
+ final int start = read.getAlignmentStart();
+ final int stop = read.getAlignmentEnd();
+ final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases();
+ final int nm = SequenceUtil.countMismatches(read, ref, start - 1);
+ final int readLen = read.getReadLength();
+ final double nmFraction = nm / (1.0 * readLen);
+ if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus"))
+ throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
+ }
+
+ private void checkCigar (GATKSAMRecord read) {
+ if (read.getCigar().isValid(null, -1) != null) {
+ throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString());
+ }
+
+ }
+
+
+ /**
+ * Compresses the read name using the readNameHash if we have already compressed
+ * this read name before.
+ *
+ * @param read any read
+ */
+ private void compressReadName(GATKSAMRecord read) {
+ String name = read.getReadName();
+ String compressedName = read.isReducedRead() ? "C" : "";
+ if (readNameHash.containsKey(name))
+ compressedName += readNameHash.get(name).toString();
+ else {
+ readNameHash.put(name, nextReadNumber);
+ compressedName += nextReadNumber.toString();
+ nextReadNumber++;
+ }
+
+ read.setReadName(compressedName);
+ }
+
+ /**
+ * Returns true if the read is the original read that went through map().
+ *
+ * This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled.
+ *
+ * @param list the list
+ * @param read the read
+ * @return Returns true if the read is the original read that went through map().
+ */
+ private boolean isOriginalRead(LinkedList list, GATKSAMRecord read) {
+ return isWholeGenome() || list.getFirst().equals(read);
+ }
+
+ /**
+ * Checks whether or not the intervalList is empty, meaning we're running in WGS mode.
+ *
+ * @return whether or not we're running in WGS mode.
+ */
+ private boolean isWholeGenome() {
+ return intervalList.isEmpty();
+ }
+
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
new file mode 100644
index 000000000..593047504
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java
@@ -0,0 +1,110 @@
+package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+
+import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+/**
+ * This class implements a "read stash" that keeps reads always sorted in alignment order. Useful
+ * for read walkers that alter the alignment information of the incoming reads, but need to
+ * maintain the reads sorted for the reduce step. (e.g. ReduceReads)
+ */
+
+public class ReduceReadsStash {
+ protected MultiSampleCompressor compressor;
+ SortedSet outOfOrderReads;
+
+ /**
+ * Creates a stash with the default sorting order (read alignment)
+ * @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close())
+ */
+ public ReduceReadsStash(MultiSampleCompressor compressor) {
+ this.compressor = compressor;
+ this.outOfOrderReads = new TreeSet(new AlignmentStartWithNoTiesComparator());
+ }
+
+ /**
+ * Get all reads before a given read (for processing)
+ *
+ * @param read the original read
+ * @return all reads that have alignment start before the original read.
+ */
+ public List getAllReadsBefore(GATKSAMRecord read) {
+ List result = new LinkedList();
+ GATKSAMRecord newHead = null;
+
+ for (GATKSAMRecord stashedRead : outOfOrderReads) {
+ if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0)
+ result.add(stashedRead);
+ else {
+ newHead = stashedRead;
+ break;
+ }
+ }
+
+ if (result.size() > 0) {
+ if (result.size() == outOfOrderReads.size())
+ outOfOrderReads.clear();
+ else
+ outOfOrderReads = new TreeSet(outOfOrderReads.tailSet(newHead));
+ }
+
+ return result;
+ }
+
+ /**
+ * sends the read to the MultiSampleCompressor
+ *
+ * @param read the read to be compressed
+ * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
+ */
+ public Iterable compress(GATKSAMRecord read) {
+ return compressor.addAlignment(read);
+ }
+
+ /**
+ * Add a read to the stash
+ *
+ * @param read any read
+ */
+ public void add(GATKSAMRecord read) {
+ outOfOrderReads.add(read);
+ }
+
+ /**
+ * Close the stash, processing all remaining reads in order
+ *
+ * @return a list of all the reads produced by the SlidingWindow machinery
+ */
+ public Iterable close() {
+ LinkedList result = new LinkedList();
+
+ // compress all the stashed reads (in order)
+ for (GATKSAMRecord read : outOfOrderReads)
+ for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
+ result.add(compressedRead);
+
+ // output any remaining reads from the compressor
+ for (GATKSAMRecord read : compressor.close())
+ result.add(read);
+
+ return result;
+ }
+
+ /**
+ * Useful debug functionality, outputs all elements in the stash
+ */
+ public void print() {
+ int i = 1;
+ System.out.println("Stash Contents:");
+ for (GATKSAMRecord read : outOfOrderReads)
+ System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd()));
+ System.out.println();
+ }
+
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
new file mode 100644
index 000000000..6d2c2d215
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
@@ -0,0 +1,83 @@
+package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.TreeSet;
+
+/**
+ * Compresses the reads of a single sample by streaming them through one SlidingWindow
+ * at a time; a window is closed (and its reads emitted) whenever the incoming read
+ * starts a new contig or lies beyond the window's reach.
+ *
+ * @author depristo
+ * @version 0.1
+ */
+public class SingleSampleCompressor implements Compressor {
+    protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class);
+
+    protected final int contextSize;
+    protected final int downsampleCoverage;
+    protected int minMappingQuality;
+    protected int slidingWindowCounter;       // running id handed to each new SlidingWindow (it names the consensus reads)
+
+    protected final String sampleName;
+
+    protected SlidingWindow slidingWindow;    // the single active window; null until the first read, or right after a window is closed
+    protected double minAltProportionToTriggerVariant;
+    protected double minIndelProportionToTriggerVariant;
+    protected int minBaseQual;
+
+    protected ReduceReads.DownsampleStrategy downsampleStrategy;
+
+    public SingleSampleCompressor(final String sampleName,
+                                  final int contextSize,
+                                  final int downsampleCoverage,
+                                  final int minMappingQuality,
+                                  final double minAltProportionToTriggerVariant,
+                                  final double minIndelProportionToTriggerVariant,
+                                  final int minBaseQual,
+                                  final ReduceReads.DownsampleStrategy downsampleStrategy) {
+        this.sampleName = sampleName;
+        this.contextSize = contextSize;
+        this.downsampleCoverage = downsampleCoverage;
+        this.minMappingQuality = minMappingQuality;
+        this.slidingWindowCounter = 0;
+        this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
+        this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
+        this.minBaseQual = minBaseQual;
+        this.downsampleStrategy = downsampleStrategy;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
+        TreeSet<GATKSAMRecord> result = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
+        int readOriginalStart = read.getUnclippedStart();
+
+        // close the current window if the read belongs to a brand new contig, or starts
+        // too far away from the end of the current sliding window to ever interact with it
+        if ((slidingWindow != null) &&
+                ((read.getReferenceIndex() != slidingWindow.getContigIndex()) ||
+                 (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) {
+            result.addAll(slidingWindow.close());
+            slidingWindow = null; // so we create a new one below
+        }
+
+        if (slidingWindow == null) { // first read ever, or first read after closing a window
+            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
+            slidingWindowCounter++;
+        }
+
+        result.addAll(slidingWindow.addRead(read));
+        return result;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public Iterable<GATKSAMRecord> close() {
+        return (slidingWindow != null) ? slidingWindow.close() : new TreeSet<GATKSAMRecord>();
+    }
+
+}
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
new file mode 100644
index 000000000..60d7096a3
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
@@ -0,0 +1,713 @@
+package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+
+import com.google.java.contract.Requires;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.gatk.downsampling.FractionalDownsampler;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: roger
+ * Date: 8/3/11
+ * Time: 2:24 PM
+ */
+public class SlidingWindow {
+
+ // Sliding Window data
+ // reads currently held inside the window, in arrival order; removed once finalized
+ final private LinkedList readsInWindow;
+ // one element per reference position covered by the window (maintained by updateHeaderCounts)
+ final private LinkedList windowHeader;
+ protected int contextSize; // the largest context size (between mismatches and indels)
+ protected int stopLocation; // rightmost reference position covered by the header; -1 until the first read arrives
+ protected String contig;
+ protected int contigIndex;
+ protected SAMFileHeader header;
+ protected GATKSAMReadGroupRecord readGroupAttribute;
+ protected int downsampleCoverage; // target coverage for variant-region downsampling; 0 disables it
+
+ // Running consensus data
+ protected SyntheticRead runningConsensus; // consensus under construction; null when none is open
+ protected int consensusCounter;
+ protected String consensusReadName;
+
+ // Filtered Data Consensus data
+ protected SyntheticRead filteredDataConsensus; // consensus of bases that failed the quality filters; null when none is open
+ protected int filteredDataConsensusCounter;
+ protected String filteredDataReadName;
+
+
+ // Additional parameters
+ protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches
+ protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions
+ protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value
+ protected int MIN_MAPPING_QUALITY;
+
+ protected ReduceReads.DownsampleStrategy downsampleStrategy;
+ private boolean hasIndelQualities; // whether incoming reads carry base insertion/deletion qualities (propagated to SyntheticRead)
+
+ /**
+ * The types of synthetic reads to use in the finalizeAndAdd method
+ */
+ private enum ConsensusType {
+ CONSENSUS,
+ FILTERED,
+ BOTH
+ }
+
+ /** @return the rightmost reference position covered by the window header (-1 if no read was ever added) */
+ public int getStopLocation() {
+ return stopLocation;
+ }
+
+ /** @return the name of the contig this window is anchored to */
+ public String getContig() {
+ return contig;
+ }
+
+ /** @return the index of the contig this window is anchored to */
+ public int getContigIndex() {
+ return contigIndex;
+ }
+
+ /** @return the reference position of the first header element, or -1 when the header is empty */
+ public int getStartLocation() {
+ return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation();
+ }
+
+
+    /**
+     * Builds a sliding window anchored to a single contig.
+     *
+     * @param contig / contigIndex        the contig this window lives on
+     * @param contextSize                 context to keep around variant sites
+     * @param header / readGroupAttribute SAM metadata used when emitting consensus reads
+     * @param windowNumber                id used to name this window's consensus reads
+     * @param downsampleCoverage          target coverage for variant regions (0 = no downsampling)
+     * @param hasIndelQualities           whether reads carry base indel qualities
+     */
+    public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
+        this.stopLocation = -1; // nothing covered yet
+        this.contextSize = contextSize;
+        this.downsampleCoverage = downsampleCoverage;
+
+        this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
+        this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
+        this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
+        this.MIN_MAPPING_QUALITY = minMappingQuality;
+
+        this.windowHeader = new LinkedList<HeaderElement>();
+        this.readsInWindow = new LinkedList<GATKSAMRecord>();
+
+        this.contig = contig;
+        this.contigIndex = contigIndex;
+        this.header = header;
+        this.readGroupAttribute = readGroupAttribute;
+
+        this.consensusCounter = 0;
+        this.consensusReadName = "Consensus-" + windowNumber + "-";
+
+        this.filteredDataConsensusCounter = 0;
+        this.filteredDataReadName = "Filtered-" + windowNumber + "-";
+
+        this.runningConsensus = null;
+        this.filteredDataConsensus = null;
+
+        this.downsampleStrategy = downsampleStrategy;
+        this.hasIndelQualities = hasIndelQualities;
+    }
+
+    /**
+     * Add a read to the sliding window and slide the window accordingly.
+     *
+     * Reads are assumed to arrive in order; therefore, once a read is added the window can
+     * assume that no later read will affect positions before read.getUnclippedStart() minus
+     * the context size. The window slides forward to that position and returns all reads
+     * finalized in the process.
+     *
+     * @param read the read
+     * @return a list of reads that have been finished by sliding the window
+     */
+    public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
+        updateHeaderCounts(read, false); // update the window header counts
+        readsInWindow.add(read);         // add read to sliding reads
+        return slideWindow(read.getUnclippedStart());
+    }
+
+    /**
+     * returns the next complete or incomplete variant region between 'from' (inclusive) and 'to' (exclusive)
+     *
+     * NOTE(review): the loop body of this method was garbled in the patch extraction; it has been
+     * reconstructed to match this javadoc contract and its use in getAllVariantRegions — verify
+     * against the original source.
+     *
+     * @param from        beginning window header index of the search window (inclusive)
+     * @param to          end window header index of the search window (exclusive)
+     * @param variantSite boolean array with true marking variant regions
+     * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
+     */
+    private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
+        boolean foundStart = false;
+        int variantRegionStartIndex = 0;
+
+        for (int i = from; i < to; i++) {
+            if (variantSite[i] && !foundStart) {        // first variant site opens a region
+                variantRegionStartIndex = i;
+                foundStart = true;
+            } else if (!variantSite[i] && foundStart) { // first non-variant site after an open region closes it
+                return new Pair<Integer, Integer>(variantRegionStartIndex, i - 1);
+            }
+        }
+        // a region still open at 'to' is incomplete: report it with stop == -1
+        return foundStart ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
+    }
+
+    /**
+     * Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive)
+     *
+     * @param from        beginning window header index of the search window (inclusive)
+     * @param to          end window header index of the search window (exclusive)
+     * @param variantSite boolean array with true marking variant regions
+     * @return a list with start/stops of variant regions following getNextVariantRegion description
+     */
+    private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
+        List<Pair<Integer, Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
+        int index = from;
+        while (index < to) {
+            Pair<Integer, Integer> result = getNextVariantRegion(index, to, variantSite);
+            if (result == null)
+                break;
+
+            regions.add(result);
+            if (result.getSecond() < 0) // an incomplete region runs to the edge; nothing can follow it
+                break;
+            index = result.getSecond() + 1;
+        }
+        return regions;
+    }
+
+
+    /**
+     * Determines if the window can be slid given the new incoming read.
+     *
+     * We check from the start of the window to the (unclipped) start of the new incoming read
+     * if there is any variant; complete variant regions found in that span are closed and
+     * their reads finalized, and reads now entirely to the left of the window are dropped.
+     *
+     * @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
+     * @return all reads that have fallen to the left of the sliding window after the slide
+     */
+    protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
+        List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
+
+        if (incomingReadUnclippedStart - contextSize > getStartLocation()) {
+            int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
+            boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex);
+
+            // this is the limit of what we can close/send to consensus (non-inclusive)
+            int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0);
+
+            List<Pair<Integer, Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
+            finalizedReads = closeVariantRegions(regions, false);
+
+            List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
+            for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
+                if (read.getAlignmentEnd() < getStartLocation())
+                    readsToRemove.add(read);
+            }
+            for (GATKSAMRecord read : readsToRemove)
+                readsInWindow.remove(read);
+        }
+
+        return finalizedReads;
+    }
+
+    /**
+     * returns an array marked with variant and non-variant regions (it uses
+     * markVariantRegion to make the marks)
+     *
+     * @param stop check the window from start to stop (not-inclusive)
+     * @return a boolean array with 'true' marking variant regions and false marking consensus sites
+     */
+    protected boolean[] markSites(int stop) {
+        // the extra contextSize + 1 slots let markVariantRegion spill past 'stop'
+        boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1];
+
+        Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
+        for (int i = getStartLocation(); i < stop; i++) {
+            if (!headerElementIterator.hasNext()) // header is shorter than requested span; nothing more to inspect
+                break;
+
+            HeaderElement headerElement = headerElementIterator.next();
+            if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
+                markVariantRegion(markedSites, i - getStartLocation());
+        }
+        return markedSites;
+    }
+
+    /**
+     * Marks as true every site within contextSize of the given variant site,
+     * clamped to the bounds of the array.
+     *
+     * @param markedSites         the boolean array to bear the marks
+     * @param variantSiteLocation the location where a variant site was found
+     */
+    protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
+        int from = Math.max(variantSiteLocation - contextSize, 0);
+        int to = Math.min(variantSiteLocation + contextSize + 1, markedSites.length);
+        for (int i = from; i < to; i++)
+            markedSites[i] = true;
+    }
+
+    /**
+     * Adds bases to the running consensus or filtered data accordingly
+     *
+     * Recursively walks the header region [start, end), dispatching each contiguous run of
+     * consensus / filtered / empty elements to the matching synthetic read; switching run
+     * type finalizes the other synthetic read first so their bases never interleave.
+     *
+     * @param start the first header index to add to consensus
+     * @param end   the first header index NOT TO add to consensus
+     * @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
+     */
+    protected List<GATKSAMRecord> addToSyntheticReads(int start, int end) {
+        LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
+        if (start < end) {
+
+            ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
+
+            if (!headerElementIterator.hasNext())
+                throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end));
+
+            HeaderElement headerElement = headerElementIterator.next();
+
+            if (headerElement.hasConsensusData()) {
+                // entering a consensus run: flush any open filtered-data read first
+                reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
+
+                int endOfConsensus = findNextNonConsensusElement(start, end);
+                addToRunningConsensus(start, endOfConsensus);
+
+                if (endOfConsensus <= start)
+                    throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
+
+                reads.addAll(addToSyntheticReads(endOfConsensus, end));
+            } else if (headerElement.hasFilteredData()) {
+                // entering a filtered-data run: flush any open running consensus first
+                reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
+
+                int endOfFilteredData = findNextNonFilteredDataElement(start, end);
+                addToFilteredData(start, endOfFilteredData);
+
+                if (endOfFilteredData <= start)
+                    throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
+
+                reads.addAll(addToSyntheticReads(endOfFilteredData, end));
+            } else if (headerElement.isEmpty()) {
+                // a gap in coverage: both synthetic reads must end here
+                reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
+
+                int endOfEmptyData = findNextNonEmptyElement(start, end);
+
+                if (endOfEmptyData <= start)
+                    throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
+
+                reads.addAll(addToSyntheticReads(endOfEmptyData, end));
+            } else
+                throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
+
+        }
+
+        return reads;
+    }
+
+    /**
+     * Finalizes one or more synthetic reads.
+     *
+     * @param type the synthetic reads you want to close
+     * @return the GATKSAMRecords generated by finalizing the synthetic reads
+     */
+    private List<GATKSAMRecord> finalizeAndAdd(ConsensusType type) {
+        GATKSAMRecord read = null;
+        List<GATKSAMRecord> list = new LinkedList<GATKSAMRecord>();
+
+        switch (type) {
+            case CONSENSUS:
+                read = finalizeRunningConsensus();
+                break;
+            case FILTERED:
+                read = finalizeFilteredDataConsensus();
+                break;
+            case BOTH:
+                read = finalizeRunningConsensus();
+                if (read != null) list.add(read); // add the consensus read now; 'read' is reused for the filtered one, added below
+                read = finalizeFilteredDataConsensus();
+        }
+        // handles CONSENSUS, FILTERED, and the second read of BOTH
+        if (read != null)
+            list.add(read);
+
+        return list;
+    }
+
+    /**
+     * Looks for the next position without consensus data
+     *
+     * @param start beginning of the region
+     * @param upTo  limit to search for another consensus element (exclusive)
+     * @return index of the next position without consensus data, or upTo if the whole span has it
+     */
+    private int findNextNonConsensusElement(int start, int upTo) {
+        Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
+        int index = start;
+        while (index < upTo) {
+            if (!headerElementIterator.hasNext())
+                throw new ReviewedStingException("There are no more header elements in this window");
+
+            HeaderElement headerElement = headerElementIterator.next();
+            if (!headerElement.hasConsensusData())
+                break;
+            index++;
+        }
+        return index;
+    }
+
+    /**
+     * Looks for the next position without filtered data
+     *
+     * @param start beginning of the region
+     * @param upTo  limit to search for (exclusive)
+     * @return index of the next position with no filtered data (or with consensus data), or upTo
+     */
+    private int findNextNonFilteredDataElement(int start, int upTo) {
+        Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
+        int index = start;
+        while (index < upTo) {
+            if (!headerElementIterator.hasNext())
+                throw new ReviewedStingException("There are no more header elements in this window");
+
+            HeaderElement headerElement = headerElementIterator.next();
+            // a consensus element also ends a filtered-data run
+            if (!headerElement.hasFilteredData() || headerElement.hasConsensusData())
+                break;
+            index++;
+        }
+        return index;
+    }
+
+    /**
+     * Looks for the next non-empty header element
+     *
+     * @param start beginning of the region
+     * @param upTo  limit to search for (exclusive)
+     * @return index of the next non-empty element, or upTo
+     */
+    private int findNextNonEmptyElement(int start, int upTo) {
+        ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
+        int index = start;
+        while (index < upTo) {
+            if (!headerElementIterator.hasNext())
+                throw new ReviewedStingException("There are no more header elements in this window");
+
+            HeaderElement headerElement = headerElementIterator.next();
+            if (!headerElement.isEmpty())
+                break;
+            index++;
+        }
+        return index;
+    }
+
+
+    /**
+     * Adds bases to the filtered data synthetic read.
+     *
+     * Different from addToSyntheticReads, this method assumes a contiguous sequence of
+     * filtered-data bases (it throws if it meets consensus data or a hole).
+     *
+     * @param start the first header index to add to consensus
+     * @param end   the first header index NOT TO add to consensus
+     */
+    private void addToFilteredData(int start, int end) {
+        if (filteredDataConsensus == null)
+            filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
+
+        ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
+        for (int index = start; index < end; index++) {
+            if (!headerElementIterator.hasNext())
+                throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
+
+            HeaderElement headerElement = headerElementIterator.next();
+            if (headerElement.hasConsensusData())
+                throw new ReviewedStingException("Found consensus data inside region to add to filtered data.");
+
+            if (!headerElement.hasFilteredData())
+                throw new ReviewedStingException("No filtered data in " + index);
+
+            genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
+        }
+    }
+
+    /**
+     * Adds bases to the running consensus synthetic read.
+     *
+     * (The previous javadoc was a copy-paste of addToFilteredData's.) This method assumes a
+     * contiguous sequence of consensus bases and throws if a position lacks consensus data.
+     *
+     * @param start the first header index to add to consensus
+     * @param end   the first header index NOT TO add to consensus
+     */
+    private void addToRunningConsensus(int start, int end) {
+        if (runningConsensus == null)
+            runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
+
+        Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
+        for (int index = start; index < end; index++) {
+            if (!headerElementIterator.hasNext())
+                throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
+
+            HeaderElement headerElement = headerElementIterator.next();
+            if (!headerElement.hasConsensusData())
+                throw new ReviewedStingException("No CONSENSUS data in " + index);
+
+            genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
+        }
+    }
+
+ /**
+ * Generic accessor to add base and qualities to a synthetic read
+ *
+ * @param syntheticRead the synthetic read to add to
+ * @param baseCounts the base counts object in the header element
+ * @param rms the rms mapping quality in the header element
+ */
+ private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
+ BaseIndex base = baseCounts.baseIndexWithMostCounts();
+ byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
+ byte qual = baseCounts.averageQualsOfMostCommonBase();
+ byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
+ byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
+ syntheticRead.add(base, count, qual, insQual, delQual, rms);
+ }
+
+    /**
+     * Finalizes a variant region, plus any adjacent synthetic reads.
+     *
+     * @param start the first window header index in the variant region (inclusive)
+     * @param stop  the last window header index of the variant region (inclusive)
+     * @return all reads contained in the variant region plus any adjacent synthetic reads
+     */
+    @Requires("start <= stop")
+    protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
+        List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
+
+        int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based
+        int refStop = windowHeader.get(stop).getLocation();
+
+        for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region
+            if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
+                allReads.add(read);
+                updateHeaderCounts(read, true); // Remove this read from the window header entirely
+            }
+        }
+
+        List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
+        result.addAll(addToSyntheticReads(0, start)); // consensus for everything to the left of the region
+        result.addAll(finalizeAndAdd(ConsensusType.BOTH));
+
+        for (GATKSAMRecord read : result) {
+            readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
+        }
+
+        return result; // finalized reads will be downsampled if necessary
+    }
+
+
+    /**
+     * Closes every region in the list, removing the consumed header elements.
+     *
+     * @param regions    start/stop pairs as produced by getAllVariantRegions
+     * @param forceClose when true, an incomplete region (stop == -1) is closed through the end of the header
+     * @return all reads finalized by closing the regions
+     */
+    private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
+        List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
+        if (!regions.isEmpty()) {
+            int lastStop = -1;
+            for (Pair<Integer, Integer> region : regions) {
+                int start = region.getFirst();
+                int stop = region.getSecond();
+                if (stop < 0 && forceClose)
+                    stop = windowHeader.size() - 1;
+                if (stop >= 0) {
+                    allReads.addAll(closeVariantRegion(start, stop));
+                    lastStop = stop;
+                }
+            }
+            // clean up the window header elements up until the end of the variant region. (we keep the
+            // last element in case the following element had a read that started with insertion)
+            for (int i = 0; i < lastStop; i++)
+                windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
+        }
+        return allReads;
+    }
+
+    /**
+     * Downsamples a variant region to the downsample coverage of the sliding window.
+     *
+     * It will use the downsampling strategy defined by the SlidingWindow.
+     *
+     * BUG FIX: the fraction was previously computed as the integer expression
+     * 100 / allReads.size(), which (a) truncated to 0 for any region with more than
+     * 100 reads (dropping every read) and (b) ignored the configured downsampleCoverage
+     * that this method's contract promises to honor.
+     *
+     * @param allReads the reads to select from (all reads that cover the window)
+     * @return a list of reads selected by the downsampler to cover the window to at least the desired coverage
+     */
+    protected List<GATKSAMRecord> downsampleVariantRegion(final List<GATKSAMRecord> allReads) {
+        if (allReads.isEmpty())
+            return allReads; // guard against division by zero
+
+        double fraction = (double) downsampleCoverage / allReads.size();
+        if (fraction >= 1.0)
+            return allReads; // already at or below the target coverage
+
+        FractionalDownsampler<GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
+        downsampler.submit(allReads);
+        return downsampler.consumeDownsampledItems();
+    }
+
+    /**
+     * Properly closes a Sliding Window, finalizing all consensus and variant
+     * regions that still exist regardless of being able to fulfill the
+     * context size requirement in the end.
+     *
+     * @return All reads generated
+     */
+    public List<GATKSAMRecord> close() {
+        List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
+
+        if (!windowHeader.isEmpty()) {
+            // mark variant regions over the entire remaining header and force-close them all
+            boolean[] variantSite = markSites(stopLocation + 1);
+            List<Pair<Integer, Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
+            finalizedReads = closeVariantRegions(regions, true);
+
+            if (!windowHeader.isEmpty()) {
+                // NOTE(review): this stops one element short (size() - 1), presumably mirroring
+                // closeVariantRegions keeping the last header element — confirm the final base
+                // is not silently dropped from the consensus.
+                finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1));
+                finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
+            }
+
+        }
+        return finalizedReads;
+    }
+
+ /**
+ * generates the SAM record for the running consensus read and resets it (to null)
+ *
+ * @return the read contained in the running consensus
+ */
+ protected GATKSAMRecord finalizeRunningConsensus() {
+ GATKSAMRecord finalizedRead = null;
+ if (runningConsensus != null) {
+ if (runningConsensus.size() > 0)
+ finalizedRead = runningConsensus.close();
+ else
+ consensusCounter--;
+
+ runningConsensus = null;
+ }
+ return finalizedRead;
+ }
+
+ /**
+ * generates the SAM record for the filtered data consensus and resets it (to null)
+ *
+ * @return the read contained in the running consensus
+ */
+ protected GATKSAMRecord finalizeFilteredDataConsensus() {
+ GATKSAMRecord finalizedRead = null;
+ if (filteredDataConsensus != null) {
+ if (filteredDataConsensus.size() > 0)
+ finalizedRead = filteredDataConsensus.close();
+ else
+ filteredDataConsensusCounter--;
+
+ filteredDataConsensus = null;
+ }
+ return finalizedRead;
+ }
+
+
+ /**
+ * Updates the sliding window's header counts with the incoming read bases, insertions
+ * and deletions.
+ *
+ * Walks the read's cigar, adding (or, when removeRead is true, subtracting) each base's
+ * contribution to the HeaderElement at the corresponding reference position. When adding,
+ * the header is first grown to cover the read's soft start/end.
+ *
+ * @param read the incoming read to be added to the sliding window
+ * @param removeRead if true, subtract this read's previously-added contributions instead of adding them
+ */
+ protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) {
+ byte[] bases = read.getReadBases();
+ byte[] quals = read.getBaseQualities();
+ byte[] insQuals = read.getExistingBaseInsertionQualities();
+ byte[] delQuals = read.getExistingBaseDeletionQualities();
+ int readStart = read.getSoftStart();
+ int readEnd = read.getSoftEnd();
+ Cigar cigar = read.getCigar();
+
+ int readBaseIndex = 0;
+ // index into windowHeader where this read's first base lands
+ int startLocation = getStartLocation();
+ int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
+
+ if (removeRead && locationIndex < 0)
+ throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
+
+ if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
+ if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
+ for (int i = 1; i <= -locationIndex; i++)
+ windowHeader.addFirst(new HeaderElement(startLocation - i));
+
+ startLocation = readStart; // update start location accordingly
+ locationIndex = 0;
+ }
+
+ if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
+ int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
+ while (elementsToAdd-- > 0)
+ windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd));
+
+ stopLocation = readEnd; // update stopLocation accordingly
+ }
+
+ // Special case for leading insertions before the beginning of the sliding read
+ if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
+ windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
+ locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
+ }
+ }
+
+ // walk the cigar, keeping readBaseIndex (read coordinates) and locationIndex (header coordinates) in sync
+ Iterator headerElementIterator = windowHeader.listIterator(locationIndex);
+ HeaderElement headerElement;
+ for (CigarElement cigarElement : cigar.getCigarElements()) {
+ switch (cigarElement.getOperator()) {
+ case H:
+ break;
+ case I:
+ if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
+ break;
+ }
+
+ headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element)
+
+ if (removeRead) {
+ headerElement.removeInsertionToTheRight();
+ }
+ else {
+ headerElement.addInsertionToTheRight();
+ }
+ readBaseIndex += cigarElement.getLength();
+ break; // just ignore the insertions at the beginning of the read
+ case D:
+ int nDeletions = cigarElement.getLength();
+ while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score
+ headerElement = headerElementIterator.next();
+ byte mq = (byte) read.getMappingQuality();
+ if (removeRead)
+ headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
+ else
+ headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
+
+ locationIndex++; // deletions consume reference but no read bases
+ }
+ break;
+ case S:
+ case M:
+ case P:
+ case EQ:
+ case X:
+ int nBasesToAdd = cigarElement.getLength();
+ while (nBasesToAdd-- > 0) {
+ headerElement = headerElementIterator.next();
+ byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything)
+ byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex];
+ if (removeRead)
+ headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
+ else
+ headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
+
+ readBaseIndex++; // these operators consume both read and reference positions
+ locationIndex++;
+ }
+ break;
+ }
+ }
+ }
+}
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
new file mode 100644
index 000000000..9ee1a4634
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
@@ -0,0 +1,285 @@
+package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+
+import com.google.java.contract.Requires;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMFileHeader;
+import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Running Consensus is a read that is compressed as a sliding window travels over the reads
+ * and keeps track of all the bases that are outside of variant regions.
+ *
+ * Consensus reads have qual fields that correspond to the number of reads that had the base
+ * and passed the minimum quality threshold.
+ *
+ * The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads
+ * that compose the consensus
+ *
+ * @author Mauricio Carneiro
+ * @since 8/26/11
+ */
+public class SyntheticRead {
+ private List bases;
+ private List counts;
+ private List quals;
+ private List insertionQuals;
+ private List deletionQuals;
+ private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
+ private String readTag;
+
+ // Information to produce a GATKSAMRecord
+ private SAMFileHeader header;
+ private GATKSAMReadGroupRecord readGroupRecord;
+ private String contig;
+ private int contigIndex;
+ private String readName;
+ private Integer refStart;
+ private boolean hasIndelQualities = false;
+
+ /**
+ * Full initialization of the running consensus if you have all the information and are ready to
+ * start adding to the running consensus.
+ *
+ * @param header GATKSAMRecord file header
+ * @param readGroupRecord Read Group for the GATKSAMRecord
+ * @param contig the read's contig name
+ * @param contigIndex the read's contig index
+ * @param readName the read's name
+ * @param refStart the alignment start (reference based)
+ * @param readTag the reduce reads tag for the synthetic read
+ */
+ public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) {
+ final int initialCapacity = 10000;
+ bases = new ArrayList(initialCapacity);
+ counts = new ArrayList(initialCapacity);
+ quals = new ArrayList(initialCapacity);
+ insertionQuals = new ArrayList(initialCapacity);
+ deletionQuals = new ArrayList(initialCapacity);
+ mappingQuality = 0.0;
+
+ this.readTag = readTag;
+ this.header = header;
+ this.readGroupRecord = readGroupRecord;
+ this.contig = contig;
+ this.contigIndex = contigIndex;
+ this.readName = readName;
+ this.refStart = refStart;
+ this.hasIndelQualities = hasIndelQualities;
+ }
+
+ public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) {
+ this.bases = bases;
+ this.counts = counts;
+ this.quals = quals;
+ this.insertionQuals = insertionQuals;
+ this.deletionQuals = deletionQuals;
+ this.mappingQuality = mappingQuality;
+ this.readTag = readTag;
+ this.header = header;
+ this.readGroupRecord = readGroupRecord;
+ this.contig = contig;
+ this.contigIndex = contigIndex;
+ this.readName = readName;
+ this.refStart = refStart;
+ this.hasIndelQualities = hasIndelQualities;
+ }
+
+ /**
+ * Easy access to keep adding to a running consensus that has already been
+ * initialized with the correct read name and refStart
+ *
+ * @param base the base to add
+ * @param count number of reads with this base
+ */
+ @Requires("count < Byte.MAX_VALUE")
+ public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
+ counts.add(count);
+ bases.add(base);
+ quals.add(qual);
+ insertionQuals.add(insQual);
+ deletionQuals.add(delQual);
+ this.mappingQuality += mappingQuality;
+ }
+
+ public BaseIndex getBase(int readCoordinate) {
+ return bases.get(readCoordinate);
+ }
+
+ /**
+ * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
+ *
+ * Invalid reads are :
+ * - exclusively composed of deletions
+ *
+ * @return a GATKSAMRecord or null
+ */
+ public GATKSAMRecord close () {
+ if (isAllDeletions())
+ return null;
+
+ GATKSAMRecord read = new GATKSAMRecord(header);
+ read.setReferenceName(contig);
+ read.setReferenceIndex(contigIndex);
+ read.setReadPairedFlag(false);
+ read.setReadUnmappedFlag(false);
+ read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
+ read.setAlignmentStart(refStart);
+ read.setReadName(readName);
+ read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
+ read.setReadBases(convertReadBases());
+ read.setMappingQuality((int) Math.ceil(mappingQuality / bases.size()));
+ read.setReadGroup(readGroupRecord);
+ read.setAttribute(readTag, convertBaseCounts());
+
+ if (hasIndelQualities) {
+ read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
+ read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION);
+ }
+
+ return read;
+ }
+
+ /**
+ * Checks if the synthetic read is composed exclusively of deletions
+ *
+ * @return true if it is, false if it isn't.
+ */
+ private boolean isAllDeletions() {
+ for (BaseIndex b : bases)
+ if (b != BaseIndex.D)
+ return false;
+ return true;
+ }
+
+ public int size () {
+ return bases.size();
+ }
+
+ private byte [] convertBaseQualities() {
+ return convertVariableGivenBases(bases, quals);
+ }
+
+ private byte [] convertInsertionQualities() {
+ return convertVariableGivenBases(bases, insertionQuals);
+ }
+
+ private byte [] convertDeletionQualities() {
+ return convertVariableGivenBases(bases, deletionQuals);
+ }
+
+ protected byte [] convertBaseCounts() {
+ byte[] countsArray = convertVariableGivenBases(bases, counts);
+
+ if (countsArray.length == 0)
+ throw new ReviewedStingException("Reduced read has counts array of length 0");
+
+ byte[] compressedCountsArray = new byte [countsArray.length];
+ compressedCountsArray[0] = countsArray[0];
+ for (int i = 1; i < countsArray.length; i++)
+ compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);
+
+ return compressedCountsArray;
+ }
+
+ private byte [] convertReadBases() {
+ byte [] readArray = new byte[getReadLengthWithNoDeletions(bases)];
+ int i = 0;
+ for (BaseIndex baseIndex : bases)
+ if (baseIndex != BaseIndex.D)
+ readArray[i++] = baseIndex.getByte();
+
+ return readArray;
+ }
+
+ /**
+ * Builds the cigar string for the synthetic read
+ *
+ * Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read.
+ *
+ * @return the cigar string for the synthetic read
+ */
+ private Cigar buildCigar() {
+ LinkedList cigarElements = new LinkedList();
+ CigarOperator cigarOperator = null;
+ int length = 0;
+ for (BaseIndex b : bases) {
+ CigarOperator op;
+ switch (b) {
+ case D:
+ op = CigarOperator.DELETION;
+ break;
+ case I:
+ throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported.");
+ default:
+ op = CigarOperator.MATCH_OR_MISMATCH;
+ break;
+ }
+ if (cigarOperator == null) {
+ if (op == CigarOperator.D) // read cannot start with a deletion
+ refStart++; // if it does, we need to move the reference start forward
+ else
+ cigarOperator = op;
+ }
+ else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one
+ cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator
+ cigarOperator = op;
+ length = 0;
+ }
+
+ if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions)
+ length++;
+ }
+ if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion
+ cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element
+
+ return new Cigar(cigarElements);
+ }
+
+ /**
+ * Shared functionality for all conversion utilities
+ *
+ * @param bases the read bases
+ * @param variable the list to convert
+ * @return a converted variable given the bases and skipping deletions
+ */
+
+ private static byte [] convertVariableGivenBases (List bases, List variable) {
+ byte [] variableArray = new byte[getReadLengthWithNoDeletions(bases)];
+ int i = 0;
+ Iterator variableIterator = variable.iterator();
+ for (BaseIndex baseIndex : bases) {
+ byte count = variableIterator.next();
+ if (baseIndex != BaseIndex.D)
+ variableArray[i++] = count;
+ }
+ return variableArray;
+
+ }
+
+ /**
+ * Shared functionality for all conversion utilities
+ *
+ * @param bases the read bases
+ * @return the length of the read with no deletions
+ */
+ private static int getReadLengthWithNoDeletions(List bases) {
+ int readLength = bases.size();
+ for (BaseIndex baseIndex : bases)
+ if (baseIndex == BaseIndex.D)
+ readLength--;
+ return readLength;
+ }
+
+
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
new file mode 100644
index 000000000..f91e535b0
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
@@ -0,0 +1,242 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: carneiro
+ * Date: 7/21/11
+ * Time: 2:21 PM
+ *
+ * This is a site based implementation of an Error Model. The error model is a probability
+ * distribution for the site given the phred scaled quality.
+ */
+public class ErrorModel {
+ private byte maxQualityScore;
+ private byte minQualityScore;
+ private byte phredScaledPrior;
+ private double log10minPower;
+ private int refDepth;
+ private boolean hasData = false;
+ private ProbabilityVector probabilityVector;
+ private static final boolean compressRange = false;
+
+ private static final double log10MinusE = Math.log10(Math.exp(1.0));
+
+ /**
+ * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
+ *
+ * @param minQualityScore Minimum site quality score to evaluate
+ * @param maxQualityScore Maximum site quality score to evaluate
+ * @param phredScaledPrior Prior for site quality
+ * @param refSamplePileup Reference sample pileup
+ * @param refSampleVC VC with True alleles in reference sample pileup
+ * @param minPower Minimum power
+ */
+ public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
+ ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
+ this.maxQualityScore = maxQualityScore;
+ this.minQualityScore = minQualityScore;
+ this.phredScaledPrior = phredScaledPrior;
+ log10minPower = Math.log10(minPower);
+
+
+ double[] model = new double[maxQualityScore+1];
+ Arrays.fill(model,Double.NEGATIVE_INFINITY);
+
+ boolean hasCalledAlleles = false;
+ if (refSampleVC != null) {
+
+ for (Allele allele : refSampleVC.getAlleles()) {
+ if (allele.isCalled()) {
+ hasCalledAlleles = true;
+ break;
+ }
+ }
+
+
+ }
+ if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
+ double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
+ for (byte q=minQualityScore; q<=maxQualityScore; q++) {
+ // maximum uncertainty if there's no ref data at site
+ model[q] = p;
+ }
+ this.refDepth = 0;
+ }
+ else {
+ hasData = true;
+ int matches = 0;
+ int coverage = refSamplePileup.getNumberOfElements();
+
+ Allele refAllele = refSampleVC.getReference();
+
+ for (PileupElement refPileupElement : refSamplePileup) {
+ boolean isMatch = false;
+ for (Allele allele : refSampleVC.getAlleles())
+ isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);
+
+ matches += (isMatch?1:0);
+ // System.out.format("MATCH:%b\n",isMatch);
+ }
+
+ int mismatches = coverage - matches;
+ //System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches);
+ for (byte q=minQualityScore; q<=maxQualityScore; q++) {
+ model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
+ }
+ this.refDepth = coverage;
+ }
+
+ // compress probability vector
+ this.probabilityVector = new ProbabilityVector(model, compressRange);
+ }
+
+
+ /**
+ * Simple constructor that just takes a given log-probability vector as error model.
+ * Only intended for unit testing, not general usage.
+ * @param pvector Given vector of log-probabilities
+ *
+ */
+ public ErrorModel(double[] pvector) {
+ this.maxQualityScore = (byte)(pvector.length-1);
+ this.minQualityScore = 0;
+ this.probabilityVector = new ProbabilityVector(pvector, compressRange);
+ this.hasData = true;
+
+ }
+
+ public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
+ /* System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
+ pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
+ pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString());
+ */
+
+ // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
+ if (allele.isReference()) {
+ // for a ref allele, any base mismatch or new indel is a mismatch.
+ if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case
+ return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]);
+ else
+ // either null allele to compare, or ref/alt lengths are different (indel by definition).
+ // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
+ return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
+ }
+
+ if (refAllele.getBases().length == allele.getBases().length)
+ // alleles have the same length (eg snp or mnp)
+ return pileupElement.getBase() == allele.getBases()[0];
+
+ // for non-ref alleles,
+ byte[] alleleBases = allele.getBases();
+ int eventLength = alleleBases.length - refAllele.getBases().length;
+ if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
+ return true;
+
+ if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
+ Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases))
+ return true;
+
+ return false;
+ }
+
+
+ /**
+ * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
+ * and assuming each match is independent of each other and that the match probability is just dependent of
+ * the site quality, so p = 10.^-q/10.
+ * Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
+ * to avoid underflows we'll use the Poisson approximation with lambda = N*p.
+ * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
+ * @param q Desired q to get likelihood from
+ * @param coverage Total coverage
+ * @param mismatches Number of mismatches
+ * @return Likelihood of observations as a function of q
+ */
+ @Requires({
+ "q >= minQualityScore",
+ "q <= maxQualityScore",
+ "coverage >= 0",
+ "mismatches >= 0",
+ "mismatches <= coverage"
+ })
+ private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
+ // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
+ double lambda = MathUtils.phredScaleToProbability(q) * (double )coverage;
+ // log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k)
+ return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches);
+ }
+
+ @Requires({"qual-minQualityScore <= maxQualityScore"})
+ public double getSiteLogErrorProbabilityGivenQual (int qual) {
+ return probabilityVector.getLogProbabilityForIndex(qual);
+ }
+
+ public byte getMaxQualityScore() {
+ return maxQualityScore;
+ }
+
+ public byte getMinQualityScore() {
+ return minQualityScore;
+ }
+
+ public int getMinSignificantQualityScore() {
+ return new ProbabilityVector(probabilityVector,true).getMinVal();
+ }
+
+ public int getMaxSignificantQualityScore() {
+ return new ProbabilityVector(probabilityVector,true).getMaxVal();
+ }
+
+ public int getReferenceDepth() {
+ return refDepth;
+ }
+ public boolean hasData() {
+ return hasData;
+ }
+
+ public ProbabilityVector getErrorModelVector() {
+ return probabilityVector;
+ }
+
+ public String toString() {
+ String result = "(";
+ boolean skipComma = true;
+ for (double v : probabilityVector.getProbabilityVector()) {
+ if (skipComma) {
+ skipComma = false;
+ }
+ else {
+ result += ",";
+ }
+ result += String.format("%.4f", v);
+ }
+ return result + ")";
+ }
+
+ public static int getTotalReferenceDepth(HashMap perLaneErrorModels) {
+ int n=0;
+ for (ErrorModel e : perLaneErrorModels.values()) {
+ n += e.getReferenceDepth();
+ }
+ return n;
+ }
+
+ /*
+@Requires({"maxAlleleCount >= 0"})
+//todo -- memoize this function
+ public boolean hasPowerForMaxAC (int maxAlleleCount) {
+ int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
+ double log10CumSum = getCumulativeSum(siteQ);
+ return log10CumSum < log10minPower;
+ } */
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java
new file mode 100644
index 000000000..b8be24cad
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java
@@ -0,0 +1,706 @@
+/*
+ * Copyright (c) 2010.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.variantcontext.*;
+
+import java.io.PrintStream;
+import java.util.*;
+
+public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
+ static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
+ final protected UnifiedArgumentCollection UAC;
+
+ private final int ploidy;
+ private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
+ private final static boolean VERBOSE = false;
+
+ protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
+ super(UAC, N, logger, verboseWriter);
+ ploidy = UAC.samplePloidy;
+ this.UAC = UAC;
+
+ }
+
+ public List getLog10PNonRef(final VariantContext vc,
+ final double[] log10AlleleFrequencyPriors,
+ final AlleleFrequencyCalculationResult result) {
+
+ GenotypesContext GLs = vc.getGenotypes();
+ List alleles = vc.getAlleles();
+
+ // don't try to genotype too many alternate alleles
+ if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
+ logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
+
+ alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
+ alleles.add(vc.getReference());
+ alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
+
+
+ GLs = subsetAlleles(vc, alleles, false, ploidy);
+ }
+
+ combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
+
+ return alleles;
+ }
+
+
+ /**
+ * Simple wrapper class to hold values of combined pool likelihoods.
+ * For fast hashing and fast retrieval, there's a hash map that shadows main list.
+ *
+ */
+ static class CombinedPoolLikelihoods {
+ private LinkedList alleleCountSetList;
+ private HashMap conformationMap;
+ private double maxLikelihood;
+
+
+ public CombinedPoolLikelihoods() {
+ // final int numElements = GenotypeLikelihoods.numLikelihoods();
+ alleleCountSetList = new LinkedList();
+ conformationMap = new HashMap();
+ maxLikelihood = Double.NEGATIVE_INFINITY;
+ }
+
+ public void add(ExactACset set) {
+ alleleCountSetList.add(set);
+ conformationMap.put(set.ACcounts, set);
+ final double likelihood = set.log10Likelihoods[0];
+
+ if (likelihood > maxLikelihood )
+ maxLikelihood = likelihood;
+
+ }
+
+ public boolean hasConformation(int[] ac) {
+ return conformationMap.containsKey(new ExactACcounts(ac));
+
+ }
+
+ public double getLikelihoodOfConformation(int[] ac) {
+ return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
+ }
+
+ public double getGLOfACZero() {
+ return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
+ }
+
+ public int getLength() {
+ return alleleCountSetList.size();
+ }
+ }
+
+ /**
+ *
+ * Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles
+ * @param vc Input variant context
+ * @param numAllelesToChoose Number of alleles to choose
+ * @param ploidy Ploidy per pool
+ * @return list of numAllelesToChoose most likely alleles
+ */
+
+ private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
+ final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
+ final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
+ for ( int i = 0; i < numOriginalAltAlleles; i++ )
+ likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
+
+ // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
+ final ArrayList GLs = getGLs(vc.getGenotypes());
+ for ( final double[] likelihoods : GLs ) {
+
+ final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
+ final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL);
+ // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
+ for (int k=1; k < acCount.length;k++) {
+ if (acCount[k] > 0)
+ likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
+
+ }
+ }
+
+ // sort them by probability mass and choose the best ones
+ Collections.sort(Arrays.asList(likelihoodSums));
+ final ArrayList bestAlleles = new ArrayList(numAllelesToChoose);
+ for ( int i = 0; i < numAllelesToChoose; i++ )
+ bestAlleles.add(likelihoodSums[i].allele);
+
+ final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose);
+ for ( Allele allele : vc.getAlternateAlleles() ) {
+ if ( bestAlleles.contains(allele) )
+ orderedBestAlleles.add(allele);
+ }
+
+ return orderedBestAlleles;
+ }
+
+
+ /**
+ * Simple non-optimized version that combines GLs from several pools and produces global AF distribution.
+ * @param GLs Inputs genotypes context with per-pool GLs
+ * @param numAlleles Number of alternate alleles
+ * @param ploidyPerPool Number of samples per pool
+ * @param log10AlleleFrequencyPriors Frequency priors
+ * @param result object to fill with output values
+ */
+ protected static void combineSinglePools(final GenotypesContext GLs,
+ final int numAlleles,
+ final int ploidyPerPool,
+ final double[] log10AlleleFrequencyPriors,
+ final AlleleFrequencyCalculationResult result) {
+
+ final ArrayList genotypeLikelihoods = getGLs(GLs);
+
+
+ int combinedPloidy = 0;
+
+ // Combine each pool incrementally - likelihoods will be renormalized at each step
+ CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods();
+
+ // first element: zero ploidy, e.g. trivial degenerate distribution
+ final int[] zeroCounts = new int[numAlleles];
+ final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
+ set.log10Likelihoods[0] = 0.0;
+
+ combinedPoolLikelihoods.add(set);
+ for (int p=1; p ACqueue = new LinkedList();
+ // mapping of ExactACset indexes to the objects
+ final HashMap indexesToACset = new HashMap();
+ final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods();
+
+ // add AC=0 to the queue
+ final int[] zeroCounts = new int[numAlleles];
+ final int newPloidy = originalPloidy + newGLPloidy;
+ zeroCounts[0] = newPloidy;
+
+ ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
+
+ ACqueue.add(zeroSet);
+ indexesToACset.put(zeroSet.ACcounts, zeroSet);
+
+ // keep processing while we have AC conformations that need to be calculated
+ double maxLog10L = Double.NEGATIVE_INFINITY;
+ while ( !ACqueue.isEmpty() ) {
+ // compute log10Likelihoods
+ final ExactACset ACset = ACqueue.remove();
+ final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
+ maxLog10L = Math.max(maxLog10L, log10LofKs);
+ // clean up memory
+ indexesToACset.remove(ACset.ACcounts);
+ if ( VERBOSE )
+ System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
+
+ }
+ return newPool;
+ }
+
+ // todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods
+ /**
+ *
+ * @param set ExactACset holding conformation to be computed
+ * @param newPool New pool likelihood holder
+ * @param originalPool Original likelihood holder
+ * @param newGL New pool GL vector to combine
+ * @param log10AlleleFrequencyPriors Prior object
+ * @param originalPloidy Total ploidy of original combined pool
+ * @param newGLPloidy Ploidy of GL vector
+ * @param result AFResult object
+ * @param maxLog10L max likelihood observed so far
+ * @param ACqueue Queue of conformations to compute
+ * @param indexesToACset AC indices of objects in queue
+ * @return max log likelihood
+ */
+ private static double calculateACConformationAndUpdateQueue(final ExactACset set,
+ final CombinedPoolLikelihoods newPool,
+ final CombinedPoolLikelihoods originalPool,
+ final double[] newGL,
+ final double[] log10AlleleFrequencyPriors,
+ final int originalPloidy,
+ final int newGLPloidy,
+ final AlleleFrequencyCalculationResult result,
+ final double maxLog10L,
+ final LinkedList ACqueue,
+ final HashMap indexesToACset) {
+
+ // compute likeihood in "set" of new set based on original likelihoods
+ final int numAlleles = set.ACcounts.counts.length;
+ final int newPloidy = set.getACsum();
+ final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);
+
+
+ // add to new pool
+ if (!Double.isInfinite(log10LofK))
+ newPool.add(set);
+
+ if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
+ if ( VERBOSE )
+ System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
+ return log10LofK;
+ }
+
+ // iterate over higher frequencies if possible
+ // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
+ // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
+ final int ACwiggle = set.ACcounts.counts[0];
+ if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
+ return log10LofK;
+
+
+ // add conformations for other cases
+ for ( int allele = 1; allele < numAlleles; allele++ ) {
+ final int[] ACcountsClone = set.ACcounts.getCounts().clone();
+ ACcountsClone[allele]++;
+ // is this a valid conformation?
+ int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
+ ACcountsClone[0] = newPloidy - altSum;
+ if (ACcountsClone[0] < 0)
+ continue;
+
+
+ PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
+ }
+
+
+ return log10LofK;
+ }
+
+
+ /**
+ * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
+ * Math is generalization of biallelic combiner.
+ *
+ * For vector K representing an allele count conformation,
+ * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
+ * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
+ *
+ * NOTE(review): the entire implementation below is commented out, so this method is currently
+ * a no-op — no argument is read or written. The disabled code also passes double[] pool
+ * arguments to computeLofK(), whereas the live computeLofK() takes a CombinedPoolLikelihoods,
+ * so it would not compile as-is. Confirm whether this is dead code or a rewrite in progress.
+ *
+ * @param originalPool First pool's likelihoods (CombinedPoolLikelihoods form)
+ * @param yy Second pool GL vector
+ * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
+ * @param ploidy2 Ploidy of second pool
+ * @param numAlleles Number of alleles
+ * @param log10AlleleFrequencyPriors Array of biallelic priors
+ * @param result Af calculation result object
+ */
+ public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
+ final double[] log10AlleleFrequencyPriors,
+ final AlleleFrequencyCalculationResult result) {
+/*
+ final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
+ final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
+
+ if (dim1 != originalPool.getLength() || dim2 != yy.length)
+ throw new ReviewedStingException("BUG: Inconsistent vector length");
+
+ if (ploidy2 == 0)
+ return;
+
+ final int newPloidy = ploidy1 + ploidy2;
+
+ // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
+ // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
+ PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
+ final double[] x = originalPool.getLikelihoodsAsVector(true);
+ while(firstIterator.hasNext()) {
+ x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
+ firstIterator.next();
+ }
+
+ PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
+ final double[] y = yy.clone();
+ while(secondIterator.hasNext()) {
+ y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
+ secondIterator.next();
+ }
+
+ // initialize output to -log10(choose(m1+m2,[k1 k2...])
+ final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
+ final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
+
+
+ // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
+ while(outputIterator.hasNext()) {
+ final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
+ double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
+
+ originalPool.add(likelihood, set, outputIterator.getLinearIndex());
+ outputIterator.next();
+ }
+*/
+ }
+
+ /**
+ * Compute likelihood of a particular AC conformation and update AFresult object
+ *
+ * Convention: the ExactACset conformation vector includes the reference allele count at
+ * index 0, so the counts of a valid conformation sum to the combined ploidy of both pools.
+ *
+ * @param set Set of AC counts to compute
+ * @param firstGLs Original pool likelihoods before combining
+ * @param secondGL New GL vector with additional pool
+ * @param log10AlleleFrequencyPriors Allele frequency priors
+ * @param numAlleles Number of alleles (including ref)
+ * @param ploidy1 Ploidy of original pool (combined)
+ * @param ploidy2 Ploidy of new pool
+ * @param result AFResult object
+ * @return log-likelihood of requested conformation, with per-alt-allele priors applied
+ */
+ private static double computeLofK(final ExactACset set,
+ final CombinedPoolLikelihoods firstGLs,
+ final double[] secondGL,
+ final double[] log10AlleleFrequencyPriors,
+ final int numAlleles, final int ploidy1, final int ploidy2,
+ final AlleleFrequencyCalculationResult result) {
+
+ final int newPloidy = ploidy1 + ploidy2;
+
+ // sanity check
+ int totalAltK = set.getACsum();
+ if (newPloidy != totalAltK)
+ throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
+
+ totalAltK -= set.ACcounts.counts[0];
+ // totalAltK has sum of alt alleles of conformation now
+
+
+ // special case for k = 0 over all k
+ if ( totalAltK == 0 ) { // all-ref case
+ // NOTE(review): assumes HOM_REF_INDEX addresses the all-ref conformation of secondGL — confirm
+ final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
+ set.log10Likelihoods[0] = log10Lof0;
+
+ result.setLog10LikelihoodOfAFzero(log10Lof0);
+ result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
+
+ } else {
+
+ // initialize result with denominator
+ // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
+ // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
+
+ int[] currentCount = set.ACcounts.getCounts();
+ double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
+
+ // for current conformation, get all possible ways to break vector K into two components G1 and G2
+ // (log-space marginalization over every split of the conformation between the two pools)
+ final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
+ set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
+ while (innerIterator.hasNext()) {
+ // check if breaking current conformation into g1 and g2 is feasible.
+ final int[] acCount2 = innerIterator.getCurrentVector();
+ final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2);
+ final int idx2 = innerIterator.getLinearIndex();
+ // see if conformation is valid and if original pool had this conformation
+ // for conformation to be valid, all elements of g2 have to be <= elements of current AC set
+ if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) {
+ final double gl2 = secondGL[idx2];
+ if (!Double.isInfinite(gl2)) {
+ // term = L1(G1) * L2(G2) * choose(m1,G1) * choose(m2,G2), accumulated in log space
+ final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1);
+ final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1);
+ final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
+ final double sum = firstGL + gl2 + num1 + num2;
+
+ set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
+ }
+ }
+ innerIterator.next();
+ }
+
+ // divide the weighted sum by choose(m1+m2, K) to recover the combined likelihood
+ set.log10Likelihoods[0] += denom;
+ }
+
+ double log10LofK = set.log10Likelihoods[0];
+
+ // update the MLE if necessary (MLE uses the raw likelihood, before priors)
+ final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
+ result.updateMLEifNeeded(log10LofK, altCounts);
+
+ // apply the priors over each alternate allele
+ for (final int ACcount : altCounts ) {
+ if ( ACcount > 0 )
+ log10LofK += log10AlleleFrequencyPriors[ACcount];
+ }
+ result.updateMAPifNeeded(log10LofK, altCounts);
+
+ return log10LofK;
+ }
+
+ /**
+ * Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy?
+ * @param set AC conformation vector
+ * @param ploidy Ploidy of set
+ * @return Valid conformation
+ */
+ private static boolean isValidConformation(final int[] set, final int ploidy) {
+ int sum=0;
+ for (final int ac: set) {
+ if (ac < 0)
+ return false;
+ sum += ac;
+
+ }
+
+ return (sum == ploidy);
+ }
+
+ /**
+ * Combines naively two biallelic pools (of arbitrary size).
+ * For two pools of size m1 and m2, we can compute the combined likelihood as:
+ * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
+ *
+ * The outer loop terminates early once the combined likelihood falls more than
+ * MAX_LOG10_ERROR_TO_STOP_EARLY log10 units below the best value seen so far.
+ *
+ * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
+ * @param newPLVector Second GL vector
+ * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
+ * @param ploidy2 Ploidy of second pool
+ * @param log10AlleleFrequencyPriors Array of biallelic priors
+ * @param result Af calculation result object
+ * @return Combined likelihood vector (normalized, truncated at the last computed AC)
+ */
+ public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
+ final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
+ final AlleleFrequencyCalculationResult result) {
+
+ final int newPloidy = ploidy1 + ploidy2;
+
+ final double[] combinedLikelihoods = new double[1+newPloidy];
+
+ /* Pre-fill result array and incorporate weights into input vectors
+ * Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
+ * and L2(k) = Pr(D|AC2=k) * choose(m2,k)
+ * equation reduces to
+ * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
+ * which is just plain convolution of L1 and L2 (with pre-existing vector)
+ */
+
+ // initialize result vector to -infinity
+ Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
+
+ // pad first pool's vector out to ploidy1+1 entries; padding is -infinity (zero probability)
+ final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
+ for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
+ x[k] = Double.NEGATIVE_INFINITY;
+
+ final double[] y = newPLVector.clone();
+
+
+ // AC=0 case: both pools all-ref
+ final double log10Lof0 = x[0]+y[0];
+ result.setLog10LikelihoodOfAFzero(log10Lof0);
+ result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
+
+ double maxElement = log10Lof0;
+ int maxElementIdx = 0;
+ int[] alleleCounts = new int[1];
+ for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
+ // acc[j] accumulates the j-th convolution term L1(j)*L2(k-j) in log space
+ double[] acc = new double[k+1];
+ Arrays.fill(acc,Double.NEGATIVE_INFINITY);
+ double innerMax = Double.NEGATIVE_INFINITY;
+
+ for (int j=0; j <=k; j++) {
+ double x1,y1;
+
+
+ if (k-j>=0 && k-j < y.length)
+ y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
+ else
+ continue;
+
+ if (j < x.length)
+ x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
+ else
+ continue;
+
+ if (Double.isInfinite(x1) || Double.isInfinite(y1))
+ continue;
+ acc[j] = x1 + y1;
+ if (acc[j] > innerMax)
+ innerMax = acc[j];
+ else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
+ break; // remaining terms are negligible for this k
+ }
+ combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
+ maxElementIdx = k;
+ // maxDiff is the log10 distance of this AC's likelihood from the best AC seen so far (<= 0 unless it is a new max)
+ double maxDiff = combinedLikelihoods[k] - maxElement;
+ if (maxDiff > 0)
+ maxElement = combinedLikelihoods[k];
+ // FIX: compare the log10 *difference* against -MAX_LOG10_ERROR_TO_STOP_EARLY.
+ // The previous test (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) compared a
+ // difference against an absolute log-likelihood minus the threshold — a unit mismatch
+ // that made the early stop fire at the wrong point (cf. the inner-loop test above,
+ // which correctly compares value against max-minus-threshold).
+ else if (maxDiff < -MAX_LOG10_ERROR_TO_STOP_EARLY) {
+ break;
+ }
+
+ alleleCounts[0] = k;
+ result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
+ result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
+
+
+ }
+
+
+ // truncate at the last computed AC and normalize (keeping log10 space per the 'true' flag)
+ return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
+ }
+
+
+ /**
+ * From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
+ * including updating the PL's, and assign genotypes accordingly
+ * @param vc variant context with alleles and genotype likelihoods
+ * @param allelesToUse alleles to subset (treated as including the reference: numNewAltAlleles = size - 1)
+ * @param assignGenotypes true: assign hard genotypes, false: leave as no-call
+ * @param ploidy number of chromosomes per sample (pool)
+ * @return GenotypesContext with new PLs
+ */
+ public GenotypesContext subsetAlleles(final VariantContext vc,
+ final List allelesToUse,
+ final boolean assignGenotypes,
+ final int ploidy) {
+ // the genotypes with PLs
+ final GenotypesContext oldGTs = vc.getGenotypes();
+ // one NO_CALL allele per chromosome in the pool
+ List NO_CALL_ALLELES = new ArrayList(ploidy);
+
+ for (int k=0; k < ploidy; k++)
+ NO_CALL_ALLELES.add(Allele.NO_CALL);
+
+ // samples
+ final List sampleIndices = oldGTs.getSampleNamesOrderedByName();
+
+ // the new genotypes to create
+ final GenotypesContext newGTs = GenotypesContext.create();
+
+ // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
+ final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
+ final int numNewAltAlleles = allelesToUse.size() - 1;
+
+
+ // create the new genotypes
+ for ( int k = 0; k < oldGTs.size(); k++ ) {
+ final Genotype g = oldGTs.get(sampleIndices.get(k));
+ if ( !g.hasLikelihoods() ) {
+ newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
+ continue;
+ }
+
+ // create the new likelihoods array from the alleles we are allowed to use
+ final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
+ double[] newLikelihoods;
+ if ( numOriginalAltAlleles == numNewAltAlleles) {
+ // same allele set: reuse the existing vector unchanged
+ newLikelihoods = originalLikelihoods;
+ } else {
+ newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse);
+
+ // might need to re-normalize
+ newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
+ }
+
+ // if there is no mass on the (new) likelihoods, then just no-call the sample
+ // NOTE(review): SUM_GL_THRESH_NOCALL is presumably a small negative log10 threshold —
+ // a near-zero sum of normalized log10 GLs means the vector is uninformative; confirm.
+ if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) {
+ newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
+ }
+ else {
+ final GenotypeBuilder gb = new GenotypeBuilder(g);
+
+ if ( numNewAltAlleles == 0 )
+ gb.noPL();
+ else
+ gb.PL(newLikelihoods);
+
+ // if we weren't asked to assign a genotype, then just no-call the sample
+ // NOTE(review): in this branch the sum is already <= SUM_GL_THRESH_NOCALL, so the
+ // second sum test below is always false and the condition reduces to !assignGenotypes.
+ if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL )
+ gb.alleles(NO_CALL_ALLELES);
+ else
+ assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
+ newGTs.add(gb.make());
+ }
+ }
+
+ return newGTs;
+
+ }
+
+ /**
+ * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
+ *
+ * Results are written into the supplied builder: ML allele count/frequency attributes,
+ * the called allele list, and (when alt alleles exist) the genotype quality.
+ *
+ * @param gb builder for the genotype being constructed (mutated in place)
+ * @param newLikelihoods the PL array
+ * @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
+ * @param numChromosomes Number of chromosomes per pool
+ */
+ private static void assignGenotype(final GenotypeBuilder gb,
+ final double[] newLikelihoods,
+ final List allelesToUse,
+ final int numChromosomes) {
+ final int numNewAltAlleles = allelesToUse.size() - 1;
+
+
+
+ // find the genotype with maximum likelihoods
+ final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
+
+ // translate the winning PL index back into per-allele counts (index 0 = ref)
+ final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
+ final ArrayList alleleFreqs = new ArrayList();
+ final ArrayList alleleCounts = new ArrayList();
+
+
+ // collect counts/frequencies for the alt alleles only (k starts at 1, skipping ref)
+ for (int k=1; k < mlAlleleCount.length; k++) {
+ alleleCounts.add(mlAlleleCount[k]);
+ final double freq = (double)mlAlleleCount[k] / (double)numChromosomes;
+ alleleFreqs.add(freq);
+
+ }
+
+ // per-pool logging of AC and AF
+ gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
+ gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
+
+ // remove PLs if necessary (presumably to keep output records manageable — confirm intent)
+ if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
+ gb.noPL();
+
+ ArrayList myAlleles = new ArrayList();
+
+ // add list of called ML genotypes to alleles list: each allele repeated by its ML count
+ // TODO - too unwieldy?
+ int idx = 0;
+ for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) {
+ for (int k=0; k < mlAlleleCount[mlind]; k++)
+ myAlleles.add(idx++,allelesToUse.get(mlind));
+ }
+ gb.alleles(myAlleles);
+
+ if ( numNewAltAlleles > 0 )
+ gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
+ }
+
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java
new file mode 100644
index 000000000..438acbacd
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import net.sf.samtools.SAMUtils;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
+
+import java.util.*;
+
+public abstract class PoolGenotypeLikelihoods {
+ protected final int numChromosomes;
+ private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
+
+ protected static final boolean VERBOSE = false;
+ protected static final double qualVec[] = new double[SAMUtils.MAX_PHRED_SCORE+1];
+
+ //
+ // The fundamental data arrays associated with a Genotype Likelhoods object
+ //
+ protected double[] log10Likelihoods;
+ protected double[][] logMismatchProbabilityArray;
+
+ protected final int nSamplesPerPool;
+ protected final HashMap perLaneErrorModels;
+ protected final int likelihoodDim;
+ protected final boolean ignoreLaneInformation;
+ protected final double LOG10_PLOIDY;
+ protected boolean hasReferenceSampleData;
+
+ protected final int nAlleles;
+ protected final List alleles;
+
+ private static final double MIN_LIKELIHOOD = Double.NEGATIVE_INFINITY;
+
+ private static final int MAX_NUM_ALLELES_TO_CACHE = 20;
+ private static final int MAX_NUM_SAMPLES_PER_POOL = 1000;
+
+ private static final boolean FAST_GL_COMPUTATION = true;
+ /**
+ * Constructor with given logPL elements.
+ *
+ * NOTE(review): nSamplesPerPool = ploidy/2 assumes each sample contributes exactly two
+ * chromosomes to the pool — TODO confirm pools are always made of diploid samples.
+ *
+ * @param alleles alleles corresponding to the likelihood vector
+ * @param logLikelihoods precomputed log10 likelihoods, or null to fill with MIN_LIKELIHOOD
+ * @param ploidy total number of chromosomes in the pool
+ * @param perLaneErrorModels per-lane reference-sample error models (may be null or empty)
+ * @param ignoreLaneInformation whether to collapse per-lane information
+ */
+ public PoolGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy,
+ final HashMap perLaneErrorModels, final boolean ignoreLaneInformation) {
+ this.alleles = alleles;
+ this.nAlleles = alleles.size();
+ numChromosomes = ploidy;
+ nSamplesPerPool = numChromosomes/2;
+ this.perLaneErrorModels = perLaneErrorModels;
+ this.ignoreLaneInformation = ignoreLaneInformation;
+
+ // check if at least one lane has actual data
+ if (perLaneErrorModels == null || perLaneErrorModels.isEmpty())
+ hasReferenceSampleData = false;
+ else {
+ for (Map.Entry elt : perLaneErrorModels.entrySet()) {
+ if (elt.getValue().hasData()) {
+ hasReferenceSampleData = true;
+ break;
+ }
+ }
+ }
+ // check sizes
+ if (nAlleles > MAX_NUM_ALLELES_TO_CACHE)
+ throw new UserException("No support for this number of alleles");
+
+ if (nSamplesPerPool > MAX_NUM_SAMPLES_PER_POOL)
+ throw new UserException("No support for such large number of samples per pool");
+
+ likelihoodDim = GenotypeLikelihoods.numLikelihoods(nAlleles, numChromosomes);
+
+ if (logLikelihoods == null){
+ log10Likelihoods = new double[likelihoodDim];
+ Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD);
+ } else {
+ if (logLikelihoods.length != likelihoodDim)
+ throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object");
+
+ // NOTE(review): stored by reference — later caller mutations are visible here; confirm clone is unnecessary
+ log10Likelihoods = logLikelihoods; //.clone(); // is clone needed?
+ }
+ fillCache();
+ LOG10_PLOIDY = Math.log10((double)numChromosomes);
+ }
+
+
+ /**
+ * Crucial inner class that handles addressing elements of pool likelihoods. We store likelihoods as a map
+ * of form int[] -> double (to be more precise, IntArrayWrapper -> Double).
+ * For a given ploidy (chromosome count) and number of alleles, we need a form to iterate deterministically
+ * across all possible allele conformations.
+ * Problem equivalent to listing in deterministic order all possible ways in which N integers will sum to P,
+ * where N is number of alleles and P is number of chromosomes.
+ * There's an option to list all integers so that sum will be UP to P.
+ * For example, with P=2,N=2, restrictSumTo = 2 iterator will produce
+ * [2 0 ] [1 1] [ 0 2]
+ *
+ *
+ */
+ protected static class SumIterator {
+ // current conformation vector; in restricted mode element 0 is derived from the others
+ private int[] currentState;
+ // per-dimension upper bound for each counter
+ private final int[] finalState;
+ // when > 0, every emitted vector sums exactly to this value (dimension 0 absorbs the remainder)
+ private final int restrictSumTo;
+ private final int dim;
+ private boolean hasNext;
+ // scalar index of the current conformation, in iteration order
+ private int linearIndex;
+ // running sum of the dimensions being stepped (dims >= 1 in restricted mode)
+ private int currentSum;
+
+ /**
+ * Default constructor. Typical use case: restrictSumTo = -1 if there's no sum restriction, or will generate int[]
+ * vectors so that all add to this value.
+ *
+ * @param finalState End state - typically we should set value to (P,P,P,...)
+ * @param restrictSumTo See above
+ */
+ public SumIterator(final int[] finalState,final int restrictSumTo) {
+ this.finalState = finalState;
+ this.dim = finalState.length;
+ this.restrictSumTo = restrictSumTo;
+ currentState = new int[dim];
+ reset();
+
+ }
+
+ /**
+ * Shortcut constructor for common use case: iterator will produce
+ * all vectors of length numAlleles whose sum = numChromosomes
+ * @param numAlleles Number of alleles
+ * @param numChromosomes Ploidy
+ */
+ public SumIterator(final int numAlleles, final int numChromosomes) {
+ this(getInitialStateVector(numAlleles,numChromosomes), numChromosomes);
+ }
+
+
+ // builds the (P,P,...,P) bound vector used by the shortcut constructor
+ private static int[] getInitialStateVector(final int nAlleles, final int numChromosomes) {
+ int[] initialState = new int[nAlleles];
+ Arrays.fill(initialState,numChromosomes);
+ return initialState;
+ }
+
+ /**
+ * Recompute linearIndex so it corresponds to the supplied conformation.
+ * NOTE(review): despite the name, this does NOT copy stateVector into currentState —
+ * only the linear index is updated. Confirm callers set the state separately.
+ */
+ public void setInitialStateVector(final int[] stateVector) {
+ if (restrictSumTo > 0) {
+ // check that desired vector is valid
+ if (MathUtils.sum(stateVector) != restrictSumTo)
+ throw new ReviewedStingException("BUG: initial state vector nor compatible with sum iterator");
+
+ final int numAlleles = currentState.length;
+ final int ploidy = restrictSumTo;
+
+ linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
+ }
+ else
+ throw new ReviewedStingException("BUG: Not supported");
+
+ }
+ /** Advance to the next conformation, updating hasNext and the linear index. */
+ public void next() {
+ // in restricted mode dimension 0 is derived, so stepping starts at dimension 1
+ int initialDim = (restrictSumTo > 0)?1:0;
+ hasNext = next(finalState, initialDim);
+ if (hasNext)
+ linearIndex++;
+ }
+
+ // Odometer-style increment: bump the lowest steppable dimension, resetting lower ones.
+ // In restricted mode the sum cap also forces a carry, and dimension 0 is recomputed
+ // at the end as (restrictSumTo - currentSum).
+ private boolean next(final int[] finalState, int initialDim) {
+ boolean hasNextState = false;
+ for (int currentDim=initialDim; currentDim < finalState.length; currentDim++) {
+ final int x = currentState[currentDim]+1;
+
+ if (x > finalState[currentDim] || (currentSum >= restrictSumTo && initialDim > 0)) {
+ // update vector sum, and reset position
+ currentSum -= currentState[currentDim];
+ currentState[currentDim] = 0;
+ if (currentDim >= dim-1) {
+ hasNextState = false;
+ break;
+ }
+ }
+ else {
+ currentState[currentDim] = x;
+ hasNextState = true;
+ currentSum++;
+ break;
+ }
+ }
+ if (initialDim > 0) {
+ currentState[0] = restrictSumTo - currentSum;
+ }
+ return hasNextState;
+ }
+
+ /** Reset to the first conformation (all mass on dimension 0 in restricted mode). */
+ public void reset() {
+ Arrays.fill(currentState, 0);
+ if (restrictSumTo > 0)
+ currentState[0] = restrictSumTo;
+ hasNext = true;
+ linearIndex = 0;
+ currentSum = 0;
+ }
+ /** @return the live internal state vector (callers must clone before storing) */
+ public int[] getCurrentVector() {
+ return currentState;
+ }
+
+ /** @return a copy of the current conformation excluding the reference (index 0) count */
+ public int[] getCurrentAltVector() {
+ return Arrays.copyOfRange(currentState,1,currentState.length);
+ }
+ /* public int getCurrentSum() {
+ return currentSum;
+ }
+ */
+ public int getLinearIndex() {
+ return linearIndex;
+ }
+
+ public boolean hasNext() {
+ return hasNext;
+ }
+ }
+
+ /** @return the live allele list backing this likelihoods object (not a defensive copy) */
+ public List getAlleles() { return alleles;}
+
+ /**
+ * Returns an array of log10 likelihoods for each genotype conformation, with ordering determined by SumIterator class.
+ *
+ * NOTE: this is the internal array, not a copy.
+ * @return likelihoods array
+ */
+ public double[] getLikelihoods() {
+ return log10Likelihoods;
+ }
+
+
+
+
+
+ /**
+ * Set particular element of logPL vector
+ * @param idx index of allele count conformation to modify (SumIterator linear ordering)
+ * @param pl log10 likelihood to associate with that conformation
+ */
+ public void setLogPLs(final int idx, final double pl) {
+ log10Likelihoods[idx] = pl;
+ }
+
+ /** Renormalize the stored log10 likelihoods in place via MathUtils.normalizeFromLog10
+ * (flags false,true — presumably "keep result in log10 space"; confirm flag semantics). */
+ public void renormalize() {
+ log10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods,false,true);
+ }
+ /** Compute most likely AC conformation based on currently stored PL's - just loop through log PL map and output max value
+ *
+ * Ties are broken toward the earliest conformation in SumIterator order (strict > comparison).
+ * @return vector with most likely allele count, ordered according to this object's alleles
+ */
+ public Pair getMostLikelyACCount() {
+
+ int[] mlInd = null;
+ double maxVal = Double.NEGATIVE_INFINITY;
+
+ final SumIterator iterator = new SumIterator(alleles.size(),numChromosomes);
+
+ // idx walks log10Likelihoods in lockstep with the iterator's conformations
+ int idx = 0;
+ while (iterator.hasNext()) {
+ double pl = log10Likelihoods[idx++];
+ if (pl > maxVal) {
+ maxVal = pl;
+ // clone: the iterator reuses its internal state vector on every step
+ mlInd = iterator.getCurrentVector().clone();
+
+ }
+ iterator.next();
+ }
+ if (VERBOSE) {
+ System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
+ }
+ return new Pair(mlInd,maxVal);
+ }
+
+ /**
+ * Given set of alleles with corresponding vector of likelihoods, subset to a new set of alleles
+ *
+ * @param oldLikelihoods Vector of PL's corresponding to original alleles
+ * @param numChromosomes Ploidy (number of chromosomes describing PL's)
+ * @param originalAlleles List of original alleles
+ * @param allelesToSubset Alleles to subset
+ * @return Vector of new PL's, ordered according to SumIterator's ordering
+ */
+ public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes,
+ final List originalAlleles, final List allelesToSubset) {
+
+ // every entry of newPLs is written below: each subset conformation corresponds to the
+ // original conformation that has zero counts on the excluded alleles
+ int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
+ double[] newPLs = new double[newPLSize];
+
+
+ int idx = 0;
+ // First fill boolean array stating whether each original allele is present in new mapping
+ final boolean [] allelePresent = new boolean[originalAlleles.size()];
+ for ( Allele allele : originalAlleles )
+ allelePresent[idx++] = allelesToSubset.contains(allele);
+
+
+ // compute mapping from old idx to new idx
+ // This might be needed in case new allele set is not ordered in the same way as old set
+ // Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1]
+
+ int[] permutationKey = new int[allelesToSubset.size()];
+ for (int k=0; k < allelesToSubset.size(); k++)
+ // for each allele to subset, find corresponding index in original allele list
+ permutationKey[k] = originalAlleles.indexOf(allelesToSubset.get(k));
+
+
+ if (VERBOSE) {
+ System.out.println("permutationKey:"+Arrays.toString(permutationKey));
+ }
+
+ final SumIterator iterator = new SumIterator(originalAlleles.size(),numChromosomes);
+
+ while (iterator.hasNext()) {
+ // for each entry in logPL table, associated originally with allele count stored in vec[],
+ // see if this allele count conformation will be present in new logPL table.
+ // For entry to be present, elements in dimensions not present in requested allele list have to have count = 0
+ int[] pVec = iterator.getCurrentVector();
+ double pl = oldLikelihoods[iterator.getLinearIndex()];
+
+ boolean keyPresent = true;
+ for (int k=0; k < allelePresent.length; k++)
+ if ( pVec[k]>0 && !allelePresent[k] )
+ keyPresent = false;
+
+ if (keyPresent) {// skip to next entry in logPLs if this conformation is not present in subset
+
+ final int[] newCount = new int[allelesToSubset.size()];
+
+ // map from old allele mapping count to new allele mapping
+ // In pseudo-Matlab notation: newCount = vec[permutationKey] for permutationKey vector
+ for (idx = 0; idx < newCount.length; idx++)
+ newCount[idx] = pVec[permutationKey[idx]];
+
+ // get corresponding index from new count
+ int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
+ newPLs[outputIdx] = pl;
+ if (VERBOSE) {
+ System.out.println("Old Key:"+Arrays.toString(pVec));
+ System.out.println("New Key:"+Arrays.toString(newCount));
+ }
+ }
+ iterator.next();
+ }
+
+ return newPLs;
+ }
+
+ /**
+ * Map an allele-count conformation vector to its scalar index in the likelihood vector,
+ * consistent with the iteration order produced by SumIterator.
+ *
+ * @param vectorIdx allele-count conformation (index 0 = reference count)
+ * @param numAlleles number of alleles in the conformation
+ * @param ploidy number of chromosomes (sum of the conformation)
+ * @return linear index of the conformation; 0 when ploidy is non-positive
+ */
+ public static int getLinearIndex(int[] vectorIdx, int numAlleles, int ploidy) {
+     if (ploidy <= 0)
+         return 0;
+
+     int offset = 0;
+     int remaining = ploidy;
+     // walk alleles from highest to lowest (stopping before the reference at index 0)
+     for (int allele = numAlleles - 1; allele > 0; allele--) {
+         final int count = vectorIdx[allele];
+         if (count == 0)
+             continue;
+         // add the sizes of all blocks that precede this allele's count
+         for (int skipped = 0; skipped < count; skipped++)
+             offset += getNumLikelihoodElements(allele, remaining - skipped);
+
+         remaining -= count;
+     }
+
+     return offset;
+ }
+
+ /**
+ * Given a scalar index, what's the allele count conformation corresponding to it?
+ *
+ * NOTE: returns the iterator's internal state array (no defensive copy), and null when
+ * PLindex is beyond the last conformation.
+ *
+ * @param nAlleles Number of alleles
+ * @param numChromosomes Ploidy
+ * @param PLindex Index to query
+ * @return Allele count conformation, according to iteration order from SumIterator
+ */
+ public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) {
+
+ // todo - another brain-dead inefficient implementation, can do much better by computing in closed form
+ final SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
+ while (iterator.hasNext()) {
+ final int[] plVec = iterator.getCurrentVector();
+ if (iterator.getLinearIndex() == PLindex)
+ return plVec;
+
+ iterator.next();
+ }
+
+ return null;
+
+ }
+
+ /*
+ * a cache of the PL vector sizes as a function of # of alleles and pool sizes
+ */
+
+ /**
+ * Number of genotype-likelihood elements for the given allele count and ploidy (cached).
+ * NOTE(review): the constructor accepts nAlleles == MAX_NUM_ALLELES_TO_CACHE and ploidy up
+ * to 2*MAX_NUM_SAMPLES_PER_POOL, so boundary arguments here can exceed the cache allocated
+ * in fillGLVectorSizeCache — verify against an ArrayIndexOutOfBoundsException at the limits.
+ */
+ public static int getNumLikelihoodElements(int numAlleles, int ploidy) {
+ return GenotypeLikelihoodVectorSizes[numAlleles][ploidy];
+ }
+
+ private final static int[][] GenotypeLikelihoodVectorSizes = fillGLVectorSizeCache(MAX_NUM_ALLELES_TO_CACHE, 2*MAX_NUM_SAMPLES_PER_POOL);
+
+ /**
+ * Builds the cache of PL vector sizes: cache[a][p] = number of genotype-likelihood elements
+ * for a alleles at ploidy p, computed by the recurrence sum_k cache[a-1][p-k].
+ *
+ * FIX: allocate and fill through maxAlleles and maxPloidy INCLUSIVE. The constructor accepts
+ * nAlleles == MAX_NUM_ALLELES_TO_CACHE and nSamplesPerPool == MAX_NUM_SAMPLES_PER_POOL
+ * (i.e. ploidy == 2*MAX_NUM_SAMPLES_PER_POOL), so getNumLikelihoodElements() may index those
+ * boundary values; the previous exclusive loops and allocation produced an
+ * ArrayIndexOutOfBoundsException there. All previously computed entries are unchanged.
+ *
+ * @param maxAlleles largest allele count that must be addressable
+ * @param maxPloidy largest ploidy that must be addressable
+ * @return (maxAlleles+1) x (maxPloidy+1) table of likelihood-vector sizes
+ */
+ private static int[][] fillGLVectorSizeCache(int maxAlleles, int maxPloidy) {
+
+     int[][] cache = new int[maxAlleles + 1][maxPloidy + 1];
+     for (int numAlleles = 1; numAlleles <= maxAlleles; numAlleles++) {
+         for (int ploidy = 0; ploidy <= maxPloidy; ploidy++) {
+
+             if (numAlleles == 1)
+                 cache[numAlleles][ploidy] = 1;
+             else if (ploidy == 1)
+                 cache[numAlleles][ploidy] = numAlleles;
+             else {
+                 // recurrence: distribute k chromosomes to the last allele, rest to the others
+                 int acc = 0;
+                 for (int k = 0; k <= ploidy; k++)
+                     acc += cache[numAlleles - 1][ploidy - k];
+
+                 cache[numAlleles][ploidy] = acc;
+             }
+         }
+     }
+     return cache;
+ }
+
+ /**
+ * Return a string representation of this object in a moderately usable form:
+ * the allele display strings followed by every finite-GL conformation and its value.
+ *
+ * @return string representation
+ */
+ public String toString() {
+     final StringBuilder out = new StringBuilder(1000);
+
+     out.append("Alleles:");
+     for (Allele allele : this.alleles) {
+         out.append(allele.getDisplayString());
+         out.append(",");
+     }
+     out.append("\nGLs:\n");
+
+     // dump only conformations whose likelihood is finite
+     final SumIterator conformations = new SumIterator(nAlleles, numChromosomes);
+     while (conformations.hasNext()) {
+         final double gl = getLikelihoods()[conformations.getLinearIndex()];
+         if (!Double.isInfinite(gl)) {
+             out.append("Count [");
+             for (int count : conformations.getCurrentVector()) {
+                 out.append(count);
+                 out.append(",");
+             }
+             out.append(String.format("] GL=%4.3f\n", gl));
+         }
+         conformations.next();
+     }
+     return out.toString();
+ }
+
+
+ /**
+ * Compute the pool's log10 likelihood for every AC conformation and store it in the PL vector.
+ *
+ * Two modes: the fast path (FAST_GL_COMPUTATION) explores conformations via a work queue,
+ * pruning neighborhoods of conformations that fall far below the running maximum (assumes the
+ * GL surface is unimodal and continuous — see comments below); the brute-force path evaluates
+ * every conformation in SumIterator order. Either way the PLs are renormalized at the end.
+ *
+ * @param errorModel site-specific error model
+ * @param alleleList alleles under consideration
+ * @param numObservations per-allele observation counts
+ * @param pileup read-backed pileup, forwarded to getLikelihoodOfConformation
+ */
+ public void computeLikelihoods(ErrorModel errorModel,
+ List alleleList, List numObservations, ReadBackedPileup pileup) {
+
+ if (FAST_GL_COMPUTATION) {
+ // queue up elements to be computed. Assumptions:
+ // GLs distributions are unimodal
+ // GLs are continuous
+ // Hence, once an AC conformation is computed, we queue up its immediate topological neighbors.
+ // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
+ // and we repeat until queue is empty
+ // queue of AC conformations to process
+ final LinkedList ACqueue = new LinkedList();
+ // mapping of ExactACset indexes to the objects
+ final HashMap indexesToACset = new HashMap(likelihoodDim);
+ // add AC=0 to the queue (all chromosomes assigned to the reference allele)
+ final int[] zeroCounts = new int[nAlleles];
+ zeroCounts[0] = numChromosomes;
+
+ AlleleFrequencyCalculationModel.ExactACset zeroSet =
+ new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
+
+ ACqueue.add(zeroSet);
+ indexesToACset.put(zeroSet.ACcounts, zeroSet);
+
+ // keep processing while we have AC conformations that need to be calculated
+ double maxLog10L = Double.NEGATIVE_INFINITY;
+ while ( !ACqueue.isEmpty() ) {
+ // compute log10Likelihoods
+ final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
+ final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
+
+ // adjust max likelihood seen if needed
+ maxLog10L = Math.max(maxLog10L, log10LofKs);
+ // clean up memory
+ indexesToACset.remove(ACset.ACcounts);
+ if ( VERBOSE )
+ System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
+
+ }
+
+
+ } else {
+ // brute force: evaluate every conformation in SumIterator order
+ int plIdx = 0;
+ SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
+ while (iterator.hasNext()) {
+ AlleleFrequencyCalculationModel.ExactACset ACset =
+ new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
+ // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
+ //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
+ getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
+
+ setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
+ iterator.next();
+ }
+ }
+ // normalize PL's
+ renormalize();
+
+ }
+
+ private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
+ final ErrorModel errorModel,
+ final List alleleList,
+ final List numObservations,
+ final double maxLog10L,
+ final LinkedList ACqueue,
+ final HashMap indexesToACset,
+ final ReadBackedPileup pileup) {
+ // compute likelihood of set
+ getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
+ final double log10LofK = set.log10Likelihoods[0];
+
+ // log result in PL vector
+ int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
+ setLogPLs(idx, log10LofK);
+
+ // can we abort early because the log10Likelihoods are so small?
+ if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
+ if ( VERBOSE )
+ System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
+ return log10LofK;
+ }
+
+ // iterate over higher frequencies if possible
+ // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
+ final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
+ if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
+ return log10LofK;
+
+
+ // add conformations for other cases
+ for ( int allele = 1; allele < nAlleles; allele++ ) {
+ final int[] ACcountsClone = set.ACcounts.getCounts().clone();
+ ACcountsClone[allele]++;
+ // is this a valid conformation?
+ int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
+ ACcountsClone[0] = numChromosomes - altSum;
+ if (ACcountsClone[0] < 0)
+ continue;
+
+
+ updateACset(ACcountsClone, ACqueue, indexesToACset);
+ }
+ return log10LofK;
+
+ }
+
+ /**
+ * Abstract methods, must be implemented in subclasses
+ *
+ * @param ACset Count to compute
+ * @param errorModel Site-specific error model object
+ * @param alleleList List of alleles
+ * @param numObservations Number of observations for each allele
+ * @param pileup Read backed pileup in case it's necessary
+ */
+ public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
+ final ErrorModel errorModel,
+ final List alleleList,
+ final List numObservations,
+ final ReadBackedPileup pileup);
+
+
+ public abstract int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC);
+
+ // Static methods
+ public static void updateACset(final int[] newSetCounts,
+ final LinkedList ACqueue,
+ final HashMap indexesToACset) {
+
+ final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
+ if ( !indexesToACset.containsKey(index) ) {
+ AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
+ indexesToACset.put(index, newSet);
+ ACqueue.add(newSet);
+ if (VERBOSE)
+ System.out.println(" *** Adding set to queue:" + index.toString());
+ }
+
+ }
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ //
+ // helper routines
+ //
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+
+ //
+ // Constant static data
+ //
+
+ static {
+ // cache 10^(-k/10)
+ for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++)
+ qualVec[j] = Math.pow(10.0,-(double)j/10.0);
+ }
+
+ private void fillCache() {
+ // cache Q(j,k) = log10(j/2N*(1-ek) + (2N-j)/2N*ek/3) for j = 0:2N
+
+ logMismatchProbabilityArray = new double[1+numChromosomes][1+SAMUtils.MAX_PHRED_SCORE];
+ for (int i=0; i <= numChromosomes; i++) {
+ for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
+ double phi = (double)i/numChromosomes;
+ logMismatchProbabilityArray[i][j] = Math.log10(phi * (1.0-qualVec[j]) + qualVec[j]/3.0 * (1.0-phi));
+ }
+ }
+ }
+
+}
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java
new file mode 100644
index 000000000..37b676601
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2010, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.variantcontext.*;
+
+import java.util.*;
+
+public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
+
+ //protected Set laneIDs;
+ public enum Model {
+ SNP,
+ INDEL,
+ POOLSNP,
+ POOLINDEL,
+ BOTH
+ }
+
+ final protected UnifiedArgumentCollection UAC;
+
+ protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
+ super(UAC,logger);
+ this.UAC = UAC;
+
+ }
+
+
+ /*
+ Get vc with alleles from reference sample. Can be null if there's no ref sample call or no ref sample coverage at this site.
+ */
+ protected VariantContext getTrueAlleles(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ Map contexts) {
+ // Get reference base from VCF or Reference
+ if (UAC.referenceSampleName == null)
+ return null;
+
+ AlignmentContext context = contexts.get(UAC.referenceSampleName);
+ ArrayList trueReferenceAlleles = new ArrayList();
+
+ VariantContext referenceSampleVC;
+
+ if (tracker != null && context != null)
+ referenceSampleVC = tracker.getFirstValue(UAC.referenceSampleRod, context.getLocation());
+ else
+ return null;
+
+ if (referenceSampleVC == null) {
+ trueReferenceAlleles.add(Allele.create(ref.getBase(),true));
+ return new VariantContextBuilder("pc",ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(),trueReferenceAlleles).make();
+
+ }
+ else {
+ Genotype referenceGenotype = referenceSampleVC.getGenotype(UAC.referenceSampleName);
+ List referenceAlleles = referenceGenotype.getAlleles();
+
+ return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(),
+ referenceSampleVC.getAlleles())
+ .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel())
+ .genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make())
+ .make();
+ }
+ }
+
+
+ /**
+ * GATK Engine creates readgroups of the form XXX.Y.Z
+ * XXX.Y is the unique lane identifier.
+ * Z is the id of the sample to make the read group id unique
+ * This function returns the list of lane identifiers.
+ *
+ * @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
+ * @return a collection of lane ids.
+ */
+ public static Set parseLaneIDs(Collection readGroups) {
+ HashSet result = new HashSet();
+ for (String readGroup : readGroups) {
+ result.add(getLaneIDFromReadGroupString(readGroup));
+ }
+ return result;
+ }
+
+ /**
+ * GATK Engine creates readgroups of the form XXX.Y.Z
+ * XXX.Y is the unique lane identifier.
+ * Z is the id of the sample to make the read group id unique
+ *
+ * @param readGroupID the read group id string
+ * @return just the lane id (the XXX.Y string)
+ */
+ public static String getLaneIDFromReadGroupString(String readGroupID) {
+ // System.out.println(readGroupID);
+ String [] parsedID = readGroupID.split("\\.");
+ if (parsedID.length > 1)
+ return parsedID[0] + "." + parsedID[1];
+ else
+ return parsedID[0] + ".0";
+ }
+
+
+ /** Wrapper class that encapsulates likelihood object and sample name
+ *
+ */
+ protected static class PoolGenotypeData {
+
+ public final String name;
+ public final PoolGenotypeLikelihoods GL;
+ public final int depth;
+ public final List alleles;
+
+ public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List alleles) {
+ this.name = name;
+ this.GL = GL;
+ this.depth = depth;
+ this.alleles = alleles;
+ }
+ }
+
+ // determines the alleles to use
+ protected List determineAlternateAlleles(final List sampleDataList) {
+
+ if (sampleDataList.isEmpty())
+ return Collections.emptyList();
+
+ final int REFERENCE_IDX = 0;
+ final List allAlleles = sampleDataList.get(0).GL.getAlleles();
+ double[] likelihoodSums = new double[allAlleles.size()];
+
+ // based on the GLs, find the alternate alleles with enough probability
+ for ( PoolGenotypeData sampleData : sampleDataList ) {
+ final Pair mlACPair = sampleData.GL.getMostLikelyACCount();
+ final double topLogGL = mlACPair.second;
+
+ if (sampleData.GL.getAlleles().size() != allAlleles.size())
+ throw new ReviewedStingException("BUG: inconsistent size of alleles!");
+
+ // ref allele is always first in array list
+ if (sampleData.GL.alleles.get(0).isNonReference())
+ throw new ReviewedStingException("BUG: first allele in list is not reference!");
+
+ double refGL = sampleData.GL.getLikelihoods()[REFERENCE_IDX];
+
+ // check if maximum likelihood AC is all-ref for current pool. If so, skip
+ if (mlACPair.first[REFERENCE_IDX] == sampleData.GL.numChromosomes)
+ continue;
+
+ // most likely AC is not all-ref: for all non-ref alleles, add difference of max likelihood and all-ref likelihood
+ for (int i=0; i < mlACPair.first.length; i++) {
+ if (i==REFERENCE_IDX) continue;
+
+ if (mlACPair.first[i] > 0)
+ likelihoodSums[i] += topLogGL - refGL;
+
+ }
+ }
+
+ final List allelesToUse = new ArrayList();
+ for ( int i = 0; i < likelihoodSums.length; i++ ) {
+ if ( likelihoodSums[i] > 0.0 )
+ allelesToUse.add(allAlleles.get(i));
+ }
+
+ return allelesToUse;
+ }
+
+
+ public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ Map contexts,
+ final AlignmentContextUtils.ReadOrientation contextType,
+ final List allAllelesToUse,
+ final boolean useBAQedPileup,
+ final GenomeLocParser locParser) {
+
+ HashMap perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
+ if (perLaneErrorModels == null && UAC.referenceSampleName != null)
+ return null;
+
+ if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
+ AlignmentContext mergedContext = AlignmentContextUtils.joinContexts(contexts.values());
+ Map newContext = new HashMap();
+ newContext.put(DUMMY_SAMPLE_NAME,mergedContext);
+ contexts = newContext;
+ }
+
+ // get initial alleles to genotype
+ final List allAlleles = new ArrayList();
+ if (allAllelesToUse == null || allAllelesToUse.isEmpty())
+ allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
+ else
+ allAlleles.addAll(allAllelesToUse);
+
+ if (allAlleles.isEmpty())
+ return null;
+
+ final ArrayList GLs = new ArrayList(contexts.size());
+
+ for ( Map.Entry sample : contexts.entrySet() ) {
+ // skip reference sample
+ if (UAC.referenceSampleName != null && sample.getKey().equals(UAC.referenceSampleName))
+ continue;
+
+ ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
+
+ // create the GenotypeLikelihoods object
+ final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
+ // actually compute likelihoods
+ final int nGoodBases = GL.add(pileup, UAC);
+ if ( nGoodBases > 0 )
+ // create wrapper object for likelihoods and add to list
+ GLs.add(new PoolGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup), allAlleles));
+ }
+
+ // find the alternate allele(s) that we should be using
+ final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
+
+ // start making the VariantContext
+ final GenomeLoc loc = ref.getLocus();
+ final int endLoc = getEndLocation(tracker, ref, alleles);
+
+ final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles);
+ builder.alleles(alleles);
+
+ final HashMap attributes = new HashMap();
+
+ if (UAC.referenceSampleName != null && perLaneErrorModels != null)
+ attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));
+
+ builder.attributes(attributes);
+ // create the genotypes; no-call everyone for now
+ final GenotypesContext genotypes = GenotypesContext.create();
+ final List noCall = new ArrayList();
+ noCall.add(Allele.NO_CALL);
+
+ for ( PoolGenotypeData sampleData : GLs ) {
+ // extract from multidimensional array
+ final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes,
+ allAlleles, alleles);
+
+ // normalize in log space so that max element is zero.
+ final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name, noCall);
+ gb.DP(sampleData.depth);
+ gb.PL(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
+ genotypes.add(gb.make());
+ }
+
+ return builder.genotypes(genotypes).make();
+
+ }
+
+
+ protected HashMap getPerLaneErrorModels(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ Map contexts) {
+ VariantContext refVC = getTrueAlleles(tracker, ref, contexts);
+
+
+ // Build error model for site based on reference sample, and keep stratified for each lane.
+ AlignmentContext refContext = null;
+ if (UAC.referenceSampleName != null)
+ refContext = contexts.get(UAC.referenceSampleName);
+
+ ReadBackedPileup refPileup = null;
+ if (refContext != null) {
+ HashMap perLaneErrorModels = new HashMap();
+ refPileup = refContext.getBasePileup();
+
+ Set laneIDs = new TreeSet();
+ if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO)
+ laneIDs.add(DUMMY_LANE);
+ else
+ laneIDs = parseLaneIDs(refPileup.getReadGroups());
+ // build per-lane error model for all lanes present in ref sample
+ for (String laneID : laneIDs) {
+ // get reference pileup for this lane
+ ReadBackedPileup refLanePileup = refPileup;
+ // subset for this lane
+ if (refPileup != null && !(UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO))
+ refLanePileup = refPileup.getPileupForLane(laneID);
+
+ //ReferenceSample referenceSample = new ReferenceSample(UAC.referenceSampleName, refLanePileup, trueReferenceAlleles);
+ perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower));
+ }
+ return perLaneErrorModels;
+
+ }
+ else
+ return null;
+
+ }
+
+ /*
+ Abstract methods - must be implemented in derived classes
+ */
+
+ protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles,
+ final double[] logLikelihoods,
+ final int ploidy,
+ final HashMap perLaneErrorModels,
+ final boolean useBQAedPileup,
+ final ReferenceContext ref,
+ final boolean ignoreLaneInformation);
+
+ protected abstract List getInitialAllelesToUse(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ Map contexts,
+ final AlignmentContextUtils.ReadOrientation contextType,
+ final GenomeLocParser locParser,
+ final List allAllelesToUse);
+
+ protected abstract List getFinalAllelesToUse(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ final List allAllelesToUse,
+ final ArrayList GLs);
+
+ protected abstract int getEndLocation(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ final List alternateAllelesToUse);
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java
new file mode 100644
index 000000000..df5f6002b
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java
@@ -0,0 +1,58 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
+import org.broadinstitute.sting.utils.MathUtils;
+
+public class PoolGenotypePriors implements GenotypePriors {
+ private final double[] flatPriors;
+ private final double heterozygosity;
+ private final int samplesPerPool;
+ private double[] priors = null;
+
+ /**
+ * Create a new PoolGenotypePriors object with flat priors for each possible allele count
+ */
+ public PoolGenotypePriors(double heterozygosity, int samplesPerPool) {
+ flatPriors = new double[2*samplesPerPool+1];
+ for (int k=0; k haplotypeMap;
+ final ReferenceContext refContext;
+ final int eventLength;
+ double[][] readHaplotypeLikelihoods;
+
+ public PoolIndelGenotypeLikelihoods(final List alleles,
+ final double[] logLikelihoods,
+ final int ploidy,
+ final HashMap perLaneErrorModels,
+ final boolean ignoreLaneInformation,
+ final PairHMMIndelErrorModel pairModel,
+ final LinkedHashMap haplotypeMap,
+ final ReferenceContext referenceContext) {
+ super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
+ this.pairModel = pairModel;
+ this.haplotypeMap = haplotypeMap;
+ this.refContext = referenceContext;
+ this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
+ }
+
+ // -------------------------------------------------------------------------------------
+ //
+ // add() routines. These are the workhorse routines for calculating the overall genotype
+ // likelihoods given observed bases and reads. Includes high-level operators all the
+ // way down to single base and qual functions.
+ //
+ // -------------------------------------------------------------------------------------
+
+ /**
+ * Updates likelihoods and posteriors to reflect the additional observations contained within the
+ * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
+ * pileup
+ *
+ * @param pileup read pileup
+ * @param UAC the unified argument collection, supplying the filtering parameters for the pileup
+ * @return the number of good bases found in the pileup
+ */
+ public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
+ int n = 0;
+
+ if (!hasReferenceSampleData) {
+ // no error models
+ return add(pileup, (ErrorModel)null);
+ }
+ for (String laneID : perLaneErrorModels.keySet() ) {
+ // get pileup for this lane
+ ReadBackedPileup perLanePileup;
+ if (ignoreLaneInformation)
+ perLanePileup = pileup;
+ else
+ perLanePileup = pileup.getPileupForLane(laneID);
+
+ if (perLanePileup == null || perLanePileup.isEmpty())
+ continue;
+
+ ErrorModel errorModel = perLaneErrorModels.get(laneID);
+ n += add(perLanePileup, errorModel);
+ if (ignoreLaneInformation)
+ break;
+
+ }
+
+ return n;
+ }
+
+ /**
+ * Calculates the pool's probability for all possible allele counts for all indel alleles observed.
+ * Calculation is based on the error model
+ * generated by the reference sample on the same lane. The probability is given by :
+ *
+ * The likelihood of an allele-count conformation given the pool pileup and error model is:
+ * Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) * Pr(errorModel_q) *
+ * [j1*(1-eq)/2N + eq/3*(2N-j1)/2N]^n1 * ... ), i.e. expanded for the four bases:
+ * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
+ * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
+ *
+ * log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
+ * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
+ * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
+ * = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
+ * )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
+ * + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
+ *
+ * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
+ *
+ * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPr(ac=jA,jC,jG,jT) + logsum_k( logPr (errorModel[k],
+ * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
+ *
+ * If pileup data comes from several error models (because lanes can have different error models),
+ * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
+ * = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
+ *
+ * So, for each lane, build error model and combine lanes.
+ * To store model, can do
+ * for jA=0:2N
+ * for jC = 0:2N-jA
+ * for jG = 0:2N-jA-jC
+ * for jT = 0:2N-jA-jC-jG
+ * Q(jA,jC,jG,jT)
+ * for k = minSiteQual:maxSiteQual
+ * likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
+ *
+ *
+ *
+ * where: nA,nC,nG,nT = counts of bases observed in pileup.
+ *
+ *
+ * @param pileup Base pileup
+ * @param errorModel Site error model
+ * @return Number of bases added
+ */
+ private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
+ int n=0;
+
+ // Number of observations of each allele in the pileup, in allele-list order
+ List numSeenBases = new ArrayList(this.alleles.size());
+
+ if (!hasReferenceSampleData) {
+ final int numHaplotypes = haplotypeMap.size();
+
+ final int readCounts[] = new int[pileup.getNumberOfElements()];
+ readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
+ n = readHaplotypeLikelihoods.length;
+ } else {
+ Allele refAllele = null;
+ for (Allele a:alleles) {
+ numSeenBases.add(0);
+ if (a.isReference())
+ refAllele = a;
+ }
+
+ if (refAllele == null)
+ throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");
+
+ // count number of elements in pileup
+ for (PileupElement elt : pileup) {
+ if (VERBOSE)
+ System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength());
+ int idx =0;
+ for (Allele allele : alleles) {
+ int cnt = numSeenBases.get(idx);
+ numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
+ }
+
+ n++;
+
+ }
+ }
+ computeLikelihoods(errorModel, alleles, numSeenBases, pileup);
+ return n;
+ }
+
+
+
+ /**
+ * Compute likelihood of current conformation
+ *
+ * @param ACset Count to compute
+ * @param errorModel Site-specific error model object
+ * @param alleleList List of alleles
+ * @param numObservations Number of observations for each allele in alleleList
+ */
+ public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
+ final ErrorModel errorModel,
+ final List alleleList,
+ final List numObservations,
+ final ReadBackedPileup pileup) {
+ final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
+ double p1 = 0.0;
+
+ if (!hasReferenceSampleData) {
+ // no error model: use pair HMM likelihoods
+ for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
+ double acc[] = new double[alleleList.size()];
+ for (int k=0; k < acc.length; k++ )
+ acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY;
+ p1 += MathUtils.log10sumLog10(acc);
+ }
+
+ } else {
+ final int minQ = errorModel.getMinSignificantQualityScore();
+ final int maxQ = errorModel.getMaxSignificantQualityScore();
+ final double[] acVec = new double[maxQ - minQ + 1];
+
+
+ for (int k=minQ; k<=maxQ; k++) {
+ int idx=0;
+ for (int n : numObservations)
+ acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k];
+ }
+ p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
+ }
+ ACset.log10Likelihoods[0] = p1;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java
new file mode 100644
index 000000000..c2bac4455
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2010.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.variantcontext.*;
+
+import java.util.*;
+
+public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
+ private static final int MAX_NUM_ALLELES_TO_GENOTYPE = 4;
+
+ private PairHMMIndelErrorModel pairModel;
+ private boolean allelesArePadded = false;
+ /*
+ private static ThreadLocal>> indelLikelihoodMap =
+ new ThreadLocal>>() {
+ protected synchronized HashMap> initialValue() {
+ return new HashMap>();
+ }
+ };
+ */
+
+ private LinkedHashMap haplotypeMap;
+
+ /*
+ static {
+ indelLikelihoodMap.set(new HashMap>());
+ }
+ */
+
+ protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
+ super(UAC, logger);
+
+
+ pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
+ UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
+ haplotypeMap = new LinkedHashMap();
+ }
+
+
+ public static HashMap