From 45da892ecc9d5da28dd28e7c9d6de1b8aab8cb9d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 31 Jan 2012 18:34:53 -0500 Subject: [PATCH 01/11] Better exceptions to catch malformed reads * throw exceptions in LocusIteratorByState when hitting reads starting or ending with deletions --- .../sting/gatk/iterators/LocusIteratorByState.java | 7 +++++-- .../broadinstitute/sting/utils/pileup/PileupElement.java | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 53144671c..316a20a70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -193,6 +193,9 @@ public class LocusIteratorByState extends LocusIterator { // we reenter in order to re-check cigarElementCounter against curElement's length return stepForwardOnGenome(); } else { + if (curElement != null && curElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString()); + // Reads that contain indels model the genomeOffset as the following base in the reference. Because // we fall into this else block only when indels end the read, increment genomeOffset such that the // current offset of this read is the next ref base after the end of the indel. 
This position will @@ -228,7 +231,7 @@ public class LocusIteratorByState extends LocusIterator { // we see insertions only once, when we step right onto them; the position on the read is scrolled // past the insertion right after that if (eventDelayedFlag > 1) - throw new UserException.MalformedBAM(read, "Adjacent I/D events in read " + read.getReadName()); + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength()); eventLength = curElement.getLength(); eventStart = readOffset; @@ -247,7 +250,7 @@ public class LocusIteratorByState extends LocusIterator { // generate an extended event only if we just stepped into the deletion (i.e. don't // generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!) if (eventDelayedFlag > 1) - throw new UserException.MalformedBAM(read, "Adjacent I/D events in read " + read.getReadName()); + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); eventLength = curElement.getLength(); eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only eventStart = readOffset; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index d67261ba2..9e2a66f6e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -4,6 +4,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -146,9 +147,12 @@ public class PileupElement implements Comparable { public int getRepresentativeCount() { int representativeCount = 1; - if (read.isReducedRead() && !isInsertionAtBeginningOfRead()) - representativeCount = (isDeletion()) ? Math.round((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2) : read.getReducedCount(offset); + if (read.isReducedRead() && !isInsertionAtBeginningOfRead()) { + if (isDeletion() && (offset + 1 >= read.getReadLength()) ) // deletion at the end of the read + throw new UserException.MalformedBAM(read, String.format("Deletion at the end of read %s -- cigar: %s", read.getReadName(), read.getCigarString())); + representativeCount = (isDeletion()) ? Math.round((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2) : read.getReducedCount(offset); + } return representativeCount; } From 30b937d2af93f5f943c1470a6cb8c7523a68acca Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 1 Feb 2012 16:06:22 -0500 Subject: [PATCH 05/11] Fix bug discovered in FGTP branch in which BlockInputStream returns -1 in cases where some data could be read, but not all the data requested by the caller. --- .../gatk/datasources/reads/BlockInputStream.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java index a95eb7b8e..cb37bad31 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BlockInputStream.java @@ -443,7 +443,16 @@ public class BlockInputStream extends SeekableStream implements BAMInputStream { // } // } - return eof() ? 
-1 : length-remaining; + // If any data was copied into the buffer, return the amount of data copied. + if(remaining < length) + return length - remaining; + + // Otherwise, if at eof(), return -1. + else if(eof()) + return -1; + + // Otherwise, we must've hit a bug in the system. + throw new ReviewedStingException("BUG: read returned no data, but eof() reports false."); } public void close() { From 4ed06801a72508d64e54cfa59f46ee546b5c74e0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 2 Feb 2012 10:17:04 -0500 Subject: [PATCH 07/11] Updating HaplotypeCaller's HMM calc to use GOP as a function of the read instead of a function of the haplotype in preparation for IQSR --- .../gatk/walkers/recalibration/CountCovariatesWalker.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index bdf25419f..fdfb29da6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -357,11 +357,11 @@ public class CountCovariatesWalker extends LocusWalker 0 ) { From 3abfbcbcf2eb81394adf79072232967f8f6b069d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 3 Feb 2012 12:23:21 -0500 Subject: [PATCH 09/11] Generalized the TDT for multi-allelic events --- .../TransmissionDisequilibriumTest.java | 54 +++++++++++++------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 43d5f0b28..34f4bd607 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -17,16 +18,13 @@ import java.util.*; /** * Created by IntelliJ IDEA. - * User: rpoplin, lfran + * User: rpoplin, lfran, ebanks * Date: 11/14/11 */ -public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation { +public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private Set trios = null; - private final static int REF = 0; - private final static int HET = 1; - private final static int HOM = 2; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { @@ -38,10 +36,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen } } - final Map toRet = new HashMap(1); + final Map toRet = new HashMap(1); final HashSet triosToTest = new HashSet(); - for( final Sample child : trios) { + for( final Sample child : trios ) { final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && 
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() && vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods(); @@ -65,28 +63,54 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT private double calculateTDT( final VariantContext vc, final Set triosToTest ) { - final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET); - final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET); - final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET); - final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET); - final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF); - final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF); + double nABGivenABandBB = 0.0; + double nBBGivenABandBB = 0.0; + double nAAGivenABandAB = 0.0; + double nBBGivenABandAB = 0.0; + double nAAGivenAAandAB = 0.0; + double nABGivenAAandAB = 0.0; + + // for each pair of alleles, add the likelihoods + int numAlleles = vc.getNAlleles(); + for ( int allele1 = 0; allele1 < numAlleles; allele1++ ) { + for ( int allele2 = allele1 + 1; allele2 < numAlleles; allele2++ ) { + + // TODO -- cache these for better performance + final int HOM1index = determineHomIndex(allele1, numAlleles); + final int HETindex = HOM1index + (allele2 - allele1); + final int HOM2index = determineHomIndex(allele2, numAlleles); + + nABGivenABandBB += calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, 
HETindex, HOM2index, HETindex); + nBBGivenABandBB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HOM2index, HOM2index, HETindex); + nAAGivenABandAB += calculateNChildren(vc, triosToTest, HOM1index, HETindex, HETindex); + nBBGivenABandAB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HETindex); + nAAGivenAAandAB += calculateNChildren(vc, triosToTest, HOM1index, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HOM1index, HETindex, HOM1index); + nABGivenAAandAB += calculateNChildren(vc, triosToTest, HETindex, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM1index); + } + } final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); return (numer * numer) / denom; } - private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) { + private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int momIdx, final int dadIdx ) { final double likelihoodVector[] = new double[triosToTest.size()]; int iii = 0; for( final Sample child : triosToTest ) { final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector(); final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector(); final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector(); - likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx]; + likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx]; } return MathUtils.sumLog10(likelihoodVector); } + + private static int determineHomIndex(final int alleleIndex, int numAlleles) { + 
int result = 0; + for ( int i = 0; i < alleleIndex; i++ ) + result += numAlleles--; + return result; + } }