From 5b6d279a2e6ebe1c20dab124561307da890408a8 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 29 Jun 2011 10:21:27 -0400 Subject: [PATCH] Two bug fixes: a) Modified the way clipped bases are dealt with in ReadPosRankSumTest when annotating indels. Cigar string cannot be trusted because BWA can clip good high quality bases and some sites get incorrect ReadPos annotations if BWA systematically clips at an indel breakpoint. b) PL header needs to specify "." as length. Otherwise we fail VCF validation if multiallelic sites are present. --- .../walkers/annotator/ReadPosRankSumTest.java | 85 ++++++++++++++++++- .../indels/PairHMMIndelErrorModel.java | 8 +- .../sting/utils/codecs/vcf/VCFUtils.java | 2 +- 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 2a17cd436..727904a3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -4,9 +4,11 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import net.sf.samtools.*; import java.util.Arrays; import java.util.HashMap; @@ -67,20 +69,95 @@ public class ReadPosRankSumTest extends RankSumTest { altLikelihood = like; } } - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); - final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); + + int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset()); + final int numAlignedBases = getNumAlignedBases(p.getRead()); + + int rp = readPos; if( readPos > numAlignedBases / 2 ) { readPos = numAlignedBases - ( readPos + 1 ); } + //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) + + // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads + // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. + // if (readPos < -1) + // return; + if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { refQuals.add((double)readPos); - else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) + //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); + } + else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { altQuals.add((double)readPos); + //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); + + } } } } + int getNumClippedBasesAtStart(SAMRecord read) { + // compute total number of clipped bases (soft or hard clipped) + // check for hard clips (never consider these bases): + final Cigar c = read.getCigar(); + final CigarElement first = c.getCigarElement(0); + + int numStartClippedBases = 0; + if (first.getOperator() == CigarOperator.H) { + numStartClippedBases = first.getLength(); + } + byte[] unclippedReadBases = read.getReadBases(); + byte[] unclippedReadQuals = read.getBaseQualities(); + + // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, + // and may leave a string of Q2 bases still hanging off the reads. + for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) { + if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) + numStartClippedBases++; + else + break; + + } + + return numStartClippedBases; + } + + int getNumAlignedBases(SAMRecord read) { + return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read); + } + + int getNumClippedBasesAtEnd(SAMRecord read) { + // compute total number of clipped bases (soft or hard clipped) + // check for hard clips (never consider these bases): + final Cigar c = read.getCigar(); + CigarElement last = c.getCigarElement(c.numCigarElements()-1); + + int numEndClippedBases = 0; + if (last.getOperator() == CigarOperator.H) { + numEndClippedBases = last.getLength(); + } + byte[] unclippedReadBases = read.getReadBases(); + byte[] unclippedReadQuals = read.getBaseQualities(); + + // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, + // and may leave a string of Q2 bases still hanging off the reads. + for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){ + if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) + numEndClippedBases++; + else + break; + } + + + return numEndClippedBases; + } + + int getOffsetFromClippedReadStart(SAMRecord read, int offset) { + + + return offset - getNumClippedBasesAtStart(read); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index e7b9cfa68..ab7ae4184 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,9 +32,9 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; /*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate; -import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDataManager; -import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDatum; -import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalibrationArgumentCollection; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection; */import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; @@ -57,7 +57,7 @@ import java.util.regex.Pattern; public class PairHMMIndelErrorModel { - private final int BASE_QUAL_THRESHOLD = 10; + public static final int BASE_QUAL_THRESHOLD = 20; private static final int MATCH_OFFSET = 0; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index e919d5c25..ecede068e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -190,7 +190,7 @@ public class VCFUtils { result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, -1, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2")); return result; }