diff --git a/build.xml b/build.xml
index aa5792419..834aef3cd 100644
--- a/build.xml
+++ b/build.xml
@@ -185,10 +185,7 @@
-
-
-
-
+
@@ -205,6 +202,16 @@
+
+
+
+
+
+
+
+
+
+
@@ -226,20 +233,20 @@
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
@@ -327,14 +334,18 @@
-
+
-
+
+
+
+
+
@@ -668,6 +679,24 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -842,19 +871,23 @@
-
+
-
+
-
+
-
+
+
+
+
+
@@ -906,13 +939,15 @@
-
+
-
+
-
+
-
+
+
+
@@ -975,6 +1010,8 @@
+
+
@@ -1091,15 +1128,10 @@
-
-
+
-
-
-
-
@@ -1107,9 +1139,6 @@
-
-
-
@@ -1117,10 +1146,8 @@
-
-
+
-
@@ -1131,11 +1158,9 @@
-
+
-
-
@@ -1148,9 +1173,8 @@
-
+
-
@@ -1192,14 +1216,16 @@
+
+ listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
@@ -1355,14 +1381,13 @@
-
+
-
@@ -1380,13 +1405,27 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ivy.xml b/ivy.xml
index 1d2f95dc1..b7ca65406 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -24,11 +24,8 @@
-
+
-
-
-
@@ -83,9 +80,9 @@
-
-
-
+
+
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
index 59357e1c4..94f6ff649 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java
@@ -56,18 +56,20 @@ public class AlleleBiasedDownsamplingUtils {
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i] = new ArrayList();
+ // keep all of the reduced reads
+ final ArrayList reducedReadPileups = new ArrayList();
+
// start by stratifying the reads by the alleles they represent at this position
for( final PileupElement pe : pileup ) {
- // abort if we have a reduced read - we do not want to remove it!
+ // we do not want to remove a reduced read
if ( pe.getRead().isReducedRead() )
- return pileup;
+ reducedReadPileups.add(pe);
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
- // Down-sample *each* allele by the contamination fraction applied to the entire pileup.
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final TreeSet elementsToKeep = new TreeSet(new Comparator() {
@@ -77,13 +79,23 @@ public class AlleleBiasedDownsamplingUtils {
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
}
});
+ elementsToKeep.addAll(reducedReadPileups);
+
+ // make a listing of allele counts
+ final int[] alleleCounts = new int[4];
+ for ( int i = 0; i < 4; i++ )
+ alleleCounts[i] = alleleStratifiedElements[i].size();
+
+ // do smart down-sampling
+ final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
for ( int i = 0; i < 4; i++ ) {
final ArrayList alleleList = alleleStratifiedElements[i];
- if ( alleleList.size() <= numReadsToRemove )
- logAllElements(alleleList, log);
+ // if we don't need to remove any reads, keep them all
+ if ( alleleList.size() <= targetAlleleCounts[i] )
+ elementsToKeep.addAll(alleleList);
else
- elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
+ elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
}
// clean up pointers so memory can be garbage collected if needed
@@ -93,6 +105,66 @@ public class AlleleBiasedDownsamplingUtils {
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep));
}
+ private static int scoreAlleleCounts(final int[] alleleCounts) {
+ if ( alleleCounts.length < 2 )
+ return 0;
+
+ // sort the counts (in ascending order)
+ final int[] alleleCountsCopy = alleleCounts.clone();
+ Arrays.sort(alleleCountsCopy);
+
+ final int maxCount = alleleCountsCopy[alleleCounts.length - 1];
+ final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2];
+
+ int remainderCount = 0;
+ for ( int i = 0; i < alleleCounts.length - 2; i++ )
+ remainderCount += alleleCountsCopy[i];
+
+ // try to get the best score:
+ // - in the het case the counts should be equal with nothing else
+ // - in the hom case the non-max should be zero
+ return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount));
+ }
+
+    /**
+     * Computes the per-allele read counts to target when smart down-sampling a pileup
+     *
+     * @param alleleCounts the observed read count for each allele
+     * @param numReadsToRemove the total number of reads to remove, distributed across the alleles
+     * @return the target read count for each allele after down-sampling
+     */
+ protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
+ final int numAlleles = alleleCounts.length;
+
+ int maxScore = scoreAlleleCounts(alleleCounts);
+ int[] alleleCountsOfMax = alleleCounts;
+
+ final int numReadsToRemovePerAllele = numReadsToRemove / 2;
+
+ for ( int i = 0; i < numAlleles; i++ ) {
+ for ( int j = i; j < numAlleles; j++ ) {
+ final int[] newCounts = alleleCounts.clone();
+
+ // split these cases so we don't lose on the floor (since we divided by 2)
+ if ( i == j ) {
+ newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove);
+ } else {
+ newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele);
+ newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele);
+ }
+
+ final int score = scoreAlleleCounts(newCounts);
+
+ if ( score < maxScore ) {
+ maxScore = score;
+ alleleCountsOfMax = newCounts;
+ }
+ }
+ }
+
+ return alleleCountsOfMax;
+ }
+
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
*
@@ -102,7 +174,15 @@ public class AlleleBiasedDownsamplingUtils {
* @return the list of pileup elements TO KEEP
*/
private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) {
+ if ( numElementsToRemove == 0 )
+ return elements;
+
final int pileupSize = elements.size();
+ if ( numElementsToRemove == pileupSize ) {
+ logAllElements(elements, log);
+ return new ArrayList(0);
+ }
+
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
@@ -132,15 +212,25 @@ public class AlleleBiasedDownsamplingUtils {
for ( final List reads : alleleReadMap.values() )
totalReads += reads.size();
- // Down-sample *each* allele by the contamination fraction applied to the entire pileup.
int numReadsToRemove = (int)(totalReads * downsamplingFraction);
- final List readsToRemove = new ArrayList(numReadsToRemove * alleleReadMap.size());
- for ( final List reads : alleleReadMap.values() ) {
- if ( reads.size() <= numReadsToRemove ) {
- readsToRemove.addAll(reads);
- logAllReads(reads, log);
- } else {
- readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
+
+ // make a listing of allele counts
+ final List alleles = new ArrayList(alleleReadMap.keySet());
+ alleles.remove(Allele.NO_CALL); // ignore the no-call bin
+ final int numAlleles = alleles.size();
+ final int[] alleleCounts = new int[numAlleles];
+ for ( int i = 0; i < numAlleles; i++ )
+ alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
+
+ // do smart down-sampling
+ final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
+
+ final List readsToRemove = new ArrayList(numReadsToRemove);
+ for ( int i = 0; i < numAlleles; i++ ) {
+ final List alleleBin = alleleReadMap.get(alleles.get(i));
+
+ if ( alleleBin.size() > targetAlleleCounts[i] ) {
+ readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
}
}
@@ -156,13 +246,22 @@ public class AlleleBiasedDownsamplingUtils {
* @return the list of pileup elements TO REMOVE
*/
private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) {
+ final ArrayList readsToRemove = new ArrayList(numElementsToRemove);
+
+ if ( numElementsToRemove == 0 )
+ return readsToRemove;
+
final int pileupSize = reads.size();
+ if ( numElementsToRemove == pileupSize ) {
+ logAllReads(reads, log);
+ return reads;
+ }
+
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
- ArrayList readsToRemove = new ArrayList(pileupSize - numElementsToRemove);
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) ) {
final GATKSAMRecord read = reads.get(i);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java
new file mode 100644
index 000000000..a6e5b6c5b
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java
@@ -0,0 +1,59 @@
+package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+
+import org.broadinstitute.sting.utils.GenomeLocComparator;
+
+import java.util.Collection;
+import java.util.TreeSet;
+
+/**
+ * A stash of regions that must be kept uncompressed in all samples
+ *
+ * In general, these are regions that were kept uncompressed by a tumor sample and we want to force
+ * all other samples (normals and/or tumors) to also keep these regions uncompressed
+ *
+ * User: carneiro
+ * Date: 10/15/12
+ * Time: 4:08 PM
+ */
+public class CompressionStash extends TreeSet {
+ public CompressionStash() {
+ super(new GenomeLocComparator());
+ }
+
+ /**
+ * Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc
+ * in the stash.
+ *
+ * @param insertLoc the new loc to be inserted
+     * @return true if the loc, or its merged version, wasn't present in the list before.
+ */
+ @Override
+ public boolean add(SimpleGenomeLoc insertLoc) {
+ TreeSet removedLocs = new TreeSet();
+ for (SimpleGenomeLoc existingLoc : this) {
+ if (existingLoc.isPast(insertLoc)) {
+ break; // if we're past the loc we're done looking for overlaps.
+ }
+ if (existingLoc.equals(insertLoc)) {
+ return false; // if this loc was already present in the stash, we don't need to insert it.
+ }
+ if (existingLoc.contiguousP(insertLoc)) {
+ removedLocs.add(existingLoc); // list the original loc for merging
+ }
+ }
+ for (SimpleGenomeLoc loc : removedLocs) {
+ this.remove(loc); // remove all locs that will be merged
+ }
+ removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged
+ return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash
+ }
+
+ @Override
+ public boolean addAll(Collection extends SimpleGenomeLoc> locs) {
+ boolean result = false;
+ for (SimpleGenomeLoc loc : locs) {
+ result |= this.add(loc);
+ }
+ return result;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
index 7c9fc101b..f348225ca 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java
@@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.HashMap;
import java.util.Map;
-import java.util.SortedSet;
+import java.util.Set;
import java.util.TreeSet;
/*
@@ -41,7 +42,7 @@ import java.util.TreeSet;
*
* @author depristo
*/
-public class MultiSampleCompressor implements Compressor {
+public class MultiSampleCompressor {
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
protected Map compressorsPerSample = new HashMap();
@@ -63,21 +64,36 @@ public class MultiSampleCompressor implements Compressor {
}
}
- @Override
- public Iterable addAlignment(GATKSAMRecord read) {
- String sample = read.getReadGroup().getSample();
- SingleSampleCompressor compressor = compressorsPerSample.get(sample);
+ public Set addAlignment(GATKSAMRecord read) {
+ String sampleName = read.getReadGroup().getSample();
+ SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
if ( compressor == null )
- throw new ReviewedStingException("No compressor for sample " + sample);
- return compressor.addAlignment(read);
+ throw new ReviewedStingException("No compressor for sample " + sampleName);
+ Pair, CompressionStash> readsAndStash = compressor.addAlignment(read);
+ Set reads = readsAndStash.getFirst();
+ CompressionStash regions = readsAndStash.getSecond();
+
+ reads.addAll(closeVariantRegionsInAllSamples(regions));
+
+ return reads;
}
- @Override
- public Iterable close() {
- SortedSet reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
- for ( SingleSampleCompressor comp : compressorsPerSample.values() )
- for ( GATKSAMRecord read : comp.close() )
- reads.add(read);
+ public Set close() {
+ Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
+ for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
+ Pair, CompressionStash> readsAndStash = sample.close();
+ reads = readsAndStash.getFirst();
+ }
+ return reads;
+ }
+
+ private Set closeVariantRegionsInAllSamples(CompressionStash regions) {
+ Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
+ if (!regions.isEmpty()) {
+ for (SingleSampleCompressor sample : compressorsPerSample.values()) {
+ reads.addAll(sample.closeVariantRegions(regions));
+ }
+ }
return reads;
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
index 5810bc94f..39a284d98 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java
@@ -25,6 +25,9 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileWriter;
+import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.util.SequenceUtil;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
@@ -45,6 +48,7 @@ import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@@ -81,12 +85,13 @@ import java.util.*;
*/
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
-@PartitionBy(PartitionType.INTERVAL)
+@PartitionBy(PartitionType.CONTIG)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
public class ReduceReads extends ReadWalker, ReduceReadsStash> {
@Output
- private StingSAMFileWriter out;
+ private StingSAMFileWriter out = null;
+ private SAMFileWriter writerToUse = null;
/**
* The number of bases to keep around mismatches (potential variation)
@@ -196,6 +201,10 @@ public class ReduceReads extends ReadWalker, ReduceRea
@Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
private int nContigs = 2;
+ @Hidden
+ @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
+ private boolean nwayout = false;
+
@Hidden
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
private int debugLevel = 0;
@@ -222,9 +231,12 @@ public class ReduceReads extends ReadWalker, ReduceRea
HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
+ CompressionStash compressionStash = new CompressionStash();
+
SortedSet intervalList;
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
+ private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
/**
* Basic generic initialization of the readNameHash and the intervalList. Output initialization
@@ -240,10 +252,22 @@ public class ReduceReads extends ReadWalker, ReduceRea
if (toolkit.getIntervals() != null)
intervalList.addAll(toolkit.getIntervals());
- if (!NO_PG_TAG)
- Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);
- else
+
+ final boolean preSorted = true;
+ final boolean indexOnTheFly = true;
+ final boolean keep_records = true;
+ final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
+ if (nwayout) {
+ SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
+ writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true);
+ }
+ else {
+ writerToUse = out;
out.setPresorted(false);
+ if (!NO_PG_TAG) {
+ Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
+ }
+ }
}
/**
@@ -276,7 +300,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
// Check if the read goes beyond the boundaries of the chromosome, and hard clip those boundaries.
int chromosomeLength = ref.getGenomeLocParser().getContigInfo(read.getReferenceName()).getSequenceLength();
if (read.getSoftStart() < 0)
- read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart() - 1);
+ read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart());
if (read.getSoftEnd() > chromosomeLength)
read = ReadClipper.hardClipByReadCoordinates(read, chromosomeLength - read.getSoftStart() + 1, read.getReadLength() - 1);
@@ -384,6 +408,9 @@ public class ReduceReads extends ReadWalker, ReduceRea
// output any remaining reads in the compressor
for (GATKSAMRecord read : stash.close())
outputRead(read);
+
+ if (nwayout)
+ writerToUse.close();
}
/**
@@ -552,7 +579,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
if (!DONT_COMPRESS_READ_NAMES)
compressReadName(read);
- out.addAlignment(read);
+ writerToUse.addAlignment(read);
}
/**
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
index 6a086c53b..ac3388795 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
@@ -1,8 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
+import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import java.util.Set;
import java.util.TreeSet;
/**
@@ -10,7 +12,7 @@ import java.util.TreeSet;
* @author carneiro, depristo
* @version 3.0
*/
-public class SingleSampleCompressor implements Compressor {
+public class SingleSampleCompressor {
final private int contextSize;
final private int downsampleCoverage;
final private int minMappingQuality;
@@ -24,6 +26,7 @@ public class SingleSampleCompressor implements Compressor {
private SlidingWindow slidingWindow;
private int slidingWindowCounter;
+ public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new TreeSet(), new CompressionStash());
public SingleSampleCompressor(final int contextSize,
final int downsampleCoverage,
@@ -46,12 +49,9 @@ public class SingleSampleCompressor implements Compressor {
this.allowPolyploidReduction = allowPolyploidReduction;
}
- /**
- * @{inheritDoc}
- */
- @Override
- public Iterable addAlignment( GATKSAMRecord read ) {
- TreeSet result = new TreeSet(new AlignmentStartWithNoTiesComparator());
+ public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) {
+ Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator());
+ CompressionStash stash = new CompressionStash();
int readOriginalStart = read.getUnclippedStart();
// create a new window if:
@@ -60,7 +60,9 @@ public class SingleSampleCompressor implements Compressor {
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
// close the current sliding window
- result.addAll(slidingWindow.close());
+ Pair, CompressionStash> readsAndStash = slidingWindow.close();
+ reads = readsAndStash.getFirst();
+ stash = readsAndStash.getSecond();
slidingWindow = null; // so we create a new one on the next if
}
@@ -69,13 +71,16 @@ public class SingleSampleCompressor implements Compressor {
slidingWindowCounter++;
}
- result.addAll(slidingWindow.addRead(read));
- return result;
+ stash.addAll(slidingWindow.addRead(read));
+ return new Pair, CompressionStash>(reads, stash);
}
- @Override
- public Iterable close() {
- return (slidingWindow != null) ? slidingWindow.close() : new TreeSet();
+ public Pair, CompressionStash> close() {
+ return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
+ }
+
+ public Set closeVariantRegions(CompressionStash regions) {
+ return slidingWindow.closeVariantRegions(regions);
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
index 32abe8ef6..fff1c20a5 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
@@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
+import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@@ -57,6 +58,8 @@ public class SlidingWindow {
private boolean allowPolyploidReductionInGeneral;
+ private static CompressionStash emptyRegions = new CompressionStash();
+
/**
* The types of synthetic reads to use in the finalizeAndAdd method
*/
@@ -137,7 +140,7 @@ public class SlidingWindow {
* @param read the read
* @return a list of reads that have been finished by sliding the window.
*/
- public List addRead(GATKSAMRecord read) {
+ public CompressionStash addRead(GATKSAMRecord read) {
addToHeader(windowHeader, read); // update the window header counts
readsInWindow.add(read); // add read to sliding reads
return slideWindow(read.getUnclippedStart());
@@ -151,8 +154,9 @@ public class SlidingWindow {
* @param variantSite boolean array with true marking variant regions
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
*/
- private Pair getNextVariantRegion(int from, int to, boolean[] variantSite) {
+ private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) {
boolean foundStart = false;
+ final int windowHeaderStart = getStartLocation(windowHeader);
int variantRegionStartIndex = 0;
for (int i=from; i(variantRegionStartIndex, i-1));
+ return(new SimpleGenomeLoc(contig, contigIndex, windowHeaderStart + variantRegionStartIndex, windowHeaderStart + i - 1, true));
}
}
- return (foundStart) ? new Pair(variantRegionStartIndex, -1) : null;
+ final int refStart = windowHeaderStart + variantRegionStartIndex;
+ final int refStop = windowHeaderStart + to - 1;
+ return (foundStart && forceClose) ? new SimpleGenomeLoc(contig, contigIndex, refStart, refStop, true) : null;
}
/**
@@ -172,25 +178,25 @@ public class SlidingWindow {
* @param from beginning window header index of the search window (inclusive)
* @param to end window header index of the search window (exclusive)
* @param variantSite boolean array with true marking variant regions
- * @return a list with start/stops of variant regions following getNextVariantRegion description
+ * @return a list with start/stops of variant regions following findNextVariantRegion description
*/
- private List> getAllVariantRegions(int from, int to, boolean[] variantSite) {
- List> regions = new LinkedList>();
+ private CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean forceClose) {
+ CompressionStash regions = new CompressionStash();
int index = from;
while(index < to) {
- Pair result = getNextVariantRegion(index, to, variantSite);
+ SimpleGenomeLoc result = findNextVariantRegion(index, to, variantSite, forceClose);
if (result == null)
break;
regions.add(result);
- if (result.getSecond() < 0)
+ if (!result.isFinished())
break;
- index = result.getSecond() + 1;
+
+ index = result.getStop() + 1;
}
return regions;
}
-
/**
* Determines if the window can be slid given the new incoming read.
*
@@ -201,25 +207,24 @@ public class SlidingWindow {
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
* @return all reads that have fallen to the left of the sliding window after the slide
*/
- protected List slideWindow(final int incomingReadUnclippedStart) {
- List finalizedReads = new LinkedList();
-
+ protected CompressionStash slideWindow(final int incomingReadUnclippedStart) {
final int windowHeaderStartLocation = getStartLocation(windowHeader);
+ CompressionStash regions = emptyRegions;
+ boolean forceClose = true;
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
markSites(incomingReadUnclippedStart);
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
- List> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet());
- finalizedReads = closeVariantRegions(regions, false);
-
- while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
- readsInWindow.pollFirst();
- }
+ regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
}
- return finalizedReads;
+ while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
+ readsInWindow.pollFirst();
+ }
+
+ return regions;
}
@@ -601,9 +606,7 @@ public class SlidingWindow {
toRemove.add(read);
}
}
- for (GATKSAMRecord read : toRemove) {
- readsInWindow.remove(read);
- }
+ removeReadsFromWindow(toRemove);
}
return allReads;
}
@@ -623,26 +626,27 @@ public class SlidingWindow {
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
- return result; // finalized reads will be downsampled if necessary
+ return result; // finalized reads will be downsampled if necessary
}
-
- private List closeVariantRegions(List> regions, boolean forceClose) {
- List allReads = new LinkedList();
+ public Set closeVariantRegions(CompressionStash regions) {
+ TreeSet allReads = new TreeSet(new AlignmentStartWithNoTiesComparator());
if (!regions.isEmpty()) {
int lastStop = -1;
- for (Pair region : regions) {
- int start = region.getFirst();
- int stop = region.getSecond();
- if (stop < 0 && forceClose)
- stop = windowHeader.size() - 1;
- if (stop >= 0) {
- allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
+ int windowHeaderStart = getStartLocation(windowHeader);
+
+ for (SimpleGenomeLoc region : regions) {
+ if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) {
+ int start = region.getStart() - windowHeaderStart;
+ int stop = region.getStop() - windowHeaderStart;
+
+ allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
lastStop = stop;
}
}
- for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
- windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
+
+            for (int i = 0; i <= lastStop; i++) // clean up the window header elements through (and including) the end of the last closed variant region
+ windowHeader.remove();
}
return allReads;
}
@@ -676,23 +680,24 @@ public class SlidingWindow {
*
* @return All reads generated
*/
- public List close() {
+ public Pair, CompressionStash> close() {
// mark variant regions
- List finalizedReads = new LinkedList();
+ Set finalizedReads = new TreeSet(new AlignmentStartWithNoTiesComparator());
+ CompressionStash regions = new CompressionStash();
+ boolean forceCloseUnfinishedRegions = true;
if (!windowHeader.isEmpty()) {
markSites(getStopLocation(windowHeader) + 1);
- List> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
- finalizedReads = closeVariantRegions(regions, true);
+ regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
+ finalizedReads = closeVariantRegions(regions);
if (!windowHeader.isEmpty()) {
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
}
-
}
- return finalizedReads;
+ return new Pair, CompressionStash>(finalizedReads, regions);
}
/**
@@ -797,9 +802,8 @@ public class SlidingWindow {
hetReads.add(finalizeRunningConsensus());
}
- for (GATKSAMRecord read : toRemove) {
- readsInWindow.remove(read);
- }
+ removeReadsFromWindow(toRemove);
+
return hetReads;
}
@@ -916,5 +920,11 @@ public class SlidingWindow {
}
}
}
+
+ private void removeReadsFromWindow (List readsToRemove) {
+ for (GATKSAMRecord read : readsToRemove) {
+ readsInWindow.remove(read);
+ }
+ }
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java
index e9ed6b153..0a3512aa6 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java
@@ -240,8 +240,8 @@ public class AFCalcPerformanceTest {
if ( a.isNonReference() ) {
final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : "";
logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE);
- final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
- logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost);
+ final String warningmePost = call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) == 0 && result.getLog10PosteriorOfAFEq0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
+ logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFEq0ForAllele(a) + warningmePost);
}
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index d91df82e2..4d81d0010 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -30,175 +30,44 @@ import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
-import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.*;
+import java.io.PrintStream;
import java.util.*;
public class GenotypingEngine {
private final boolean DEBUG;
- private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false);
+ private final VariantAnnotatorEngine annotationEngine;
- public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
+ public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine ) {
this.DEBUG = DEBUG;
- this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
+ this.annotationEngine = annotationEngine;
noCall.add(Allele.NO_CALL);
}
- // WARN
- // This function is the streamlined approach, currently not being used by default
- // WARN
- // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
- // WARN
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
- public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
- final ArrayList haplotypes,
- final byte[] ref,
- final GenomeLoc refLoc,
- final GenomeLoc activeRegionWindow,
- final GenomeLocParser genomeLocParser ) {
- // Prepare the list of haplotype indices to genotype
- final ArrayList allelesToGenotype = new ArrayList();
+ public List assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
+ final List haplotypes,
+ final List samples,
+ final Map haplotypeReadMap,
+ final Map> perSampleFilteredReadList,
+ final byte[] ref,
+ final GenomeLoc refLoc,
+ final GenomeLoc activeRegionWindow,
+ final GenomeLocParser genomeLocParser,
+ final List activeAllelesToGenotype ) {
- for( final Haplotype h : haplotypes ) {
- allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
- }
- final int numHaplotypes = haplotypes.size();
-
- // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
- final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
- for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
- final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
- final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
- int glIndex = 0;
- for( int iii = 0; iii < numHaplotypes; iii++ ) {
- for( int jjj = 0; jjj <= iii; jjj++ ) {
- genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
- }
- }
- genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
- }
- final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
- if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here
-
- // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
- final ArrayList haplotypesToRemove = new ArrayList();
- for( final Haplotype h : haplotypes ) {
- if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
- haplotypesToRemove.add(h);
- }
- }
- haplotypes.removeAll(haplotypesToRemove);
-
- if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
- final List>>> returnVCs = new ArrayList>>>();
- // set up the default 1-to-1 haplotype mapping object
- final HashMap> haplotypeMapping = new HashMap>();
- for( final Haplotype h : haplotypes ) {
- final ArrayList list = new ArrayList();
- list.add(h);
- haplotypeMapping.put(call.getAllele(h.getBases()), list);
- }
- returnVCs.add( new Pair>>(call,haplotypeMapping) );
- return returnVCs;
- }
-
- final ArrayList>>> returnCalls = new ArrayList>>>();
-
- // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
- final TreeSet startPosKeySet = new TreeSet();
- int count = 0;
- if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
- for( final Haplotype h : haplotypes ) {
- if( DEBUG ) {
- System.out.println( h.toString() );
- System.out.println( "> Cigar = " + h.getCigar() );
- }
- // Walk along the alignment and turn any difference from the reference into an event
- h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
- startPosKeySet.addAll(h.getEventMap().keySet());
- }
-
- // Create the VC merge priority list
- final ArrayList priorityList = new ArrayList();
- for( int iii = 0; iii < haplotypes.size(); iii++ ) {
- priorityList.add("HC" + iii);
- }
-
- // Walk along each position in the key set and create each event to be outputted
- for( final int loc : startPosKeySet ) {
- if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
- final ArrayList eventsAtThisLoc = new ArrayList();
- for( final Haplotype h : haplotypes ) {
- final HashMap eventMap = h.getEventMap();
- final VariantContext vc = eventMap.get(loc);
- if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
- eventsAtThisLoc.add(vc);
- }
- }
-
- // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
- final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
-
- // Merge the event to find a common reference representation
- final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
-
- final HashMap> alleleHashMap = new HashMap>();
- int aCount = 0;
- for( final Allele a : mergedVC.getAlleles() ) {
- alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
- }
-
- if( DEBUG ) {
- System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
- //System.out.println("Event/haplotype allele mapping = " + alleleMapper);
- }
-
- // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
- final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
- for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
- final int myNumHaplotypes = alleleMapper.size();
- final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
- final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
- int glIndex = 0;
- for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
- for( int jjj = 0; jjj <= iii; jjj++ ) {
- genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
- }
- }
-
- // using the allele mapping object translate the haplotype allele into the event allele
- final Genotype g = new GenotypeBuilder(sample)
- .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
- .phased(loc != startPosKeySet.first())
- .PL(genotypeLikelihoods).make();
- myGenotypes.add(g);
- }
- returnCalls.add( new Pair>>(
- new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
- }
- }
- return returnCalls;
- }
-
- // BUGBUG: Create a class to hold this complicated return type
- @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
- public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
- final ArrayList haplotypes,
- final byte[] ref,
- final GenomeLoc refLoc,
- final GenomeLoc activeRegionWindow,
- final GenomeLocParser genomeLocParser,
- final ArrayList activeAllelesToGenotype ) {
-
- final ArrayList>>> returnCalls = new ArrayList>>>();
+ final List returnCalls = new ArrayList();
+ final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet startPosKeySet = new TreeSet();
@@ -207,7 +76,7 @@ public class GenotypingEngine {
for( final Haplotype h : haplotypes ) {
// Walk along the alignment and turn any difference from the reference into an event
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
- if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
+ if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
if( DEBUG ) {
System.out.println( h.toString() );
System.out.println( "> Cigar = " + h.getCigar() );
@@ -217,10 +86,10 @@ public class GenotypingEngine {
}
cleanUpSymbolicUnassembledEvents( haplotypes );
- if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
- mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
+ if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
+ mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
}
- if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
+ if( in_GGA_mode ) {
for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() );
}
@@ -228,11 +97,11 @@ public class GenotypingEngine {
// Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) {
- if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
+ if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view
final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view
- if( activeAllelesToGenotype.isEmpty() ) {
+ if( !in_GGA_mode ) {
for( final Haplotype h : haplotypes ) {
final HashMap eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc);
@@ -261,7 +130,14 @@ public class GenotypingEngine {
if( eventsAtThisLoc.isEmpty() ) { continue; }
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
- final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
+ Map> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
+
+ final Allele refAllele = eventsAtThisLoc.get(0).getReference();
+ final ArrayList alleleOrdering = new ArrayList(alleleMapper.size());
+ alleleOrdering.add(refAllele);
+ for( final VariantContext vc : eventsAtThisLoc ) {
+ alleleOrdering.add(vc.getAlternateAllele(0));
+ }
// Sanity check the priority list
for( final VariantContext vc : eventsAtThisLoc ) {
@@ -283,23 +159,29 @@ public class GenotypingEngine {
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
- HashMap> alleleHashMap = new HashMap>();
- int aCount = 0;
- for( final Allele a : mergedVC.getAlleles() ) {
- alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
+ // let's update the Allele keys in the mapper because they can change after merging when there are complex events
+ final Map> updatedAlleleMapper = new HashMap>(alleleMapper.size());
+ for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) {
+ final Allele oldAllele = alleleOrdering.get(i);
+ final Allele newAllele = mergedVC.getAlleles().get(i);
+ updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele));
+ alleleOrdering.set(i, newAllele);
}
+ alleleMapper = updatedAlleleMapper;
if( DEBUG ) {
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
}
+ final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
+
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
- final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
- for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
+ final GenotypesContext genotypes = GenotypesContext.create(samples.size());
+ for( final String sample : samples ) {
final int numHaplotypes = alleleMapper.size();
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
- final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
+ final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, alleleOrdering);
int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
@@ -308,28 +190,58 @@ public class GenotypingEngine {
}
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
}
- VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
+ final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call != null ) {
- if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
- final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
- // also, need to update the allele -> haplotype mapping
- final HashMap> alleleHashMapTrim = new HashMap>();
- for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
- alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
- }
+ final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap, perSampleFilteredReadList, call );
+ VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call);
- call = vcCallTrim;
- alleleHashMap = alleleHashMapTrim;
+ if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
+ annotatedCall = VariantContextUtils.reverseTrimAlleles(annotatedCall);
}
- returnCalls.add( new Pair>>(call, alleleHashMap) );
+ returnCalls.add( annotatedCall );
}
}
}
return returnCalls;
}
- protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) {
+ private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser,
+ final Map perSampleReadMap,
+ final Map> perSampleFilteredReadList,
+ final VariantContext call ) {
+
+ final Map returnMap = new HashMap();
+ final GenomeLoc callLoc = parser.createGenomeLoc(call);
+ for( final Map.Entry sample : perSampleReadMap.entrySet() ) {
+ final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
+
+ for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) {
+ // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
+ if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end...
+ for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) {
+ likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue());
+ }
+ }
+ }
+
+ // add all filtered reads to the NO_CALL list because they weren't given any likelihoods
+ for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
+ // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
+ if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
+ for( final Allele allele : call.getAlleles() ) {
+ likelihoodMap.add(read, allele, 0.0);
+ }
+ }
+ }
+
+ returnMap.put(sample.getKey(), likelihoodMap);
+ }
+ return returnMap;
+ }
+
+
+ protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) {
final ArrayList haplotypesToRemove = new ArrayList();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().values() ) {
@@ -348,7 +260,41 @@ public class GenotypingEngine {
haplotypes.removeAll(haplotypesToRemove);
}
- protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
+ // BUGBUG: ugh, too complicated
+ protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap,
+ final Map> alleleMapper,
+ final double downsamplingFraction,
+ final PrintStream downsamplingLog ) {
+
+ final Map alleleReadMap = new HashMap();
+ for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
+ final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
+ for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
+ final List mappedHaplotypes = alleleMapperEntry.getValue();
+ for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read
+ double maxLikelihood = Double.NEGATIVE_INFINITY;
+ for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele
+ if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey().getBases())) ) { // exact match of haplotype base string
+ maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() );
+ }
+ }
+ perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
+ }
+ }
+ perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
+ alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
+ }
+
+ return alleleReadMap;
+ }
+
+ protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes,
+ final List samples,
+ final Map haplotypeReadMap,
+ final TreeSet startPosKeySet,
+ final byte[] ref,
+ final GenomeLoc refLoc ) {
+
final int MAX_SIZE_TO_COMBINE = 15;
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
if( startPosKeySet.size() <= 1 ) { return; }
@@ -392,10 +338,13 @@ public class GenotypingEngine {
}
}
// count up the co-occurrences of the events for the R^2 calculation
- final ArrayList haplotypeList = new ArrayList();
- haplotypeList.add(h);
- for( final String sample : haplotypes.get(0).getSampleKeySet() ) {
- final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0];
+ for( final String sample : samples ) {
+ final HashSet sampleSet = new HashSet(1);
+ sampleSet.add(sample);
+
+ final List alleleList = new ArrayList();
+ alleleList.add(Allele.create(h.getBases()));
+ final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeReadMap, alleleList )[0][0];
if( thisHapVC == null ) {
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
@@ -489,37 +438,87 @@ public class GenotypingEngine {
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
- protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) {
- final ArrayList> alleleMapper = new ArrayList>();
- final ArrayList refList = new ArrayList();
+ protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) {
+
+ final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1);
+ final Allele refAllele = eventsAtThisLoc.get(0).getReference();
+ alleleMapper.put(refAllele, new ArrayList());
+ for( final VariantContext vc : eventsAtThisLoc )
+ alleleMapper.put(vc.getAlternateAllele(0), new ArrayList());
+
+ final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size());
for( final Haplotype h : haplotypes ) {
- if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
- refList.add(h);
+ if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) {
+ alleleMapper.get(h.getArtificialAllele()).add(h);
+ } else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later
+ undeterminedHaplotypes.add(h);
} else {
- boolean foundInEventList = false;
+ boolean haplotypeIsDetermined = false;
for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
- foundInEventList = true;
+ alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h);
+ haplotypeIsDetermined = true;
+ break;
}
}
- if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
- refList.add(h);
- }
+
+ if( !haplotypeIsDetermined )
+ undeterminedHaplotypes.add(h);
}
}
- alleleMapper.add(refList);
- for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
- final ArrayList list = new ArrayList();
- for( final Haplotype h : haplotypes ) {
- if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
- list.add(h);
+
+ for( final Haplotype h : undeterminedHaplotypes ) {
+ Allele matchingAllele = null;
+ for( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) {
+ // don't test against the reference allele
+ if( alleleToTest.getKey().equals(refAllele) )
+ continue;
+
+ final Haplotype artificialHaplotype = alleleToTest.getValue().get(0);
+ if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
+ matchingAllele = alleleToTest.getKey();
+ break;
}
}
- alleleMapper.add(list);
+
+ if( matchingAllele == null )
+ matchingAllele = refAllele;
+ alleleMapper.get(matchingAllele).add(h);
}
+
return alleleMapper;
}
+ protected static boolean isSubSetOf(final Map subset, final Map superset, final boolean resolveSupersetToSubset) {
+
+ for ( final Map.Entry fromSubset : subset.entrySet() ) {
+ final VariantContext fromSuperset = superset.get(fromSubset.getKey());
+ if ( fromSuperset == null )
+ return false;
+
+ List supersetAlleles = fromSuperset.getAlternateAlleles();
+ if ( resolveSupersetToSubset )
+ supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles);
+
+ if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) )
+ return false;
+ }
+
+ return true;
+ }
+
+ private static List resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List currentAlleles) {
+ if ( targetReference.length() <= actualReference.length() )
+ return currentAlleles;
+
+ final List newAlleles = new ArrayList(currentAlleles.size());
+ final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length());
+ for ( final Allele a : currentAlleles ) {
+ newAlleles.add(Allele.extend(a, extraBases));
+ }
+ return newAlleles;
+ }
+
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) {
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 5aba23faa..35aa86ca2 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
-import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@@ -41,8 +40,12 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
-import org.broadinstitute.sting.gatk.walkers.genotyper.*;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.*;
@@ -129,14 +132,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1;
- @Advanced
- @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
- protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;
-
- @Advanced
- @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
- protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;
-
@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;
@@ -208,11 +203,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
// the genotyping engine
private GenotypingEngine genotypingEngine = null;
- // the annotation engine
- private VariantAnnotatorEngine annotationEngine;
-
// fasta reference reader to supplement the edges of the reference sequence
- private IndexedFastaSequenceFile referenceReader;
+ private CachingIndexedFastaSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 900;
@@ -246,15 +238,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
- simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
- simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
- simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
- simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
+ simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
+ simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
+ simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
+ simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
+ simpleUAC.CONTAMINATION_FRACTION = 0.0;
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
- annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
+ final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
Set headerInfo = new HashSet();
@@ -271,15 +264,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
VCFConstants.GENOTYPE_QUALITY_KEY,
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_PL_KEY);
- // header lines for the experimental HaplotypeCaller-specific annotations
- headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
- headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
- headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
- headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
- headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
- headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
- headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
- headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
@@ -296,7 +280,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
- genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
+ genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine );
}
//---------------------------------------------------------------------------------------------------------------
@@ -309,9 +293,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
- // enable non primary reads in the active region
+ // enable non primary and extended reads in the active region
@Override
- public boolean wantsNonPrimaryReads() { return true; }
+ public EnumSet desiredReadStates() {
+ return EnumSet.of(
+ ActiveRegionReadState.PRIMARY,
+ ActiveRegionReadState.NONPRIMARY,
+ ActiveRegionReadState.EXTENDED
+ );
+ }
@Override
@Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"})
@@ -324,15 +314,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
}
}
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
- return new ActivityProfileResult(1.0);
+ return new ActivityProfileResult(ref.getLocus(), 1.0);
}
}
if( USE_ALLELES_TRIGGER ) {
- return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
+ return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
}
- if( context == null ) { return new ActivityProfileResult(0.0); }
+ if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); }
final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
@@ -369,7 +359,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
- return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
+ return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
}
//---------------------------------------------------------------------------------------------------------------
@@ -412,60 +402,23 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
// evaluate each sample's reads against all haplotypes
- final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
- final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads );
- likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
+ final Map