diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 2a17cd436..727904a3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -4,9 +4,11 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import net.sf.samtools.*; import java.util.Arrays; import java.util.HashMap; @@ -67,20 +69,95 @@ public class ReadPosRankSumTest extends RankSumTest { altLikelihood = like; } } - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0); - final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); + + int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset()); + final int numAlignedBases = getNumAlignedBases(p.getRead()); + + int rp = readPos; if( readPos > numAlignedBases / 2 ) { readPos = numAlignedBases - ( readPos + 1 ); } + //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) + + // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads + // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. + // if (readPos < -1) + // return; + if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { refQuals.add((double)readPos); - else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) + //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); + } + else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { altQuals.add((double)readPos); + //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); + + } } } } + int getNumClippedBasesAtStart(SAMRecord read) { + // compute total number of clipped bases (soft or hard clipped) + // check for hard clips (never consider these bases): + final Cigar c = read.getCigar(); + final CigarElement first = c.getCigarElement(0); + + int numStartClippedBases = 0; + if (first.getOperator() == CigarOperator.H) { + numStartClippedBases = first.getLength(); + } + byte[] unclippedReadBases = read.getReadBases(); + byte[] unclippedReadQuals = read.getBaseQualities(); + + // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, + // and may leave a string of Q2 bases still hanging off the reads. + for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) { + if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) + numStartClippedBases++; + else + break; + + } + + return numStartClippedBases; + } + + int getNumAlignedBases(SAMRecord read) { + return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read); + } + + int getNumClippedBasesAtEnd(SAMRecord read) { + // compute total number of clipped bases (soft or hard clipped) + // check for hard clips (never consider these bases): + final Cigar c = read.getCigar(); + CigarElement last = c.getCigarElement(c.numCigarElements()-1); + + int numEndClippedBases = 0; + if (last.getOperator() == CigarOperator.H) { + numEndClippedBases = last.getLength(); + } + byte[] unclippedReadBases = read.getReadBases(); + byte[] unclippedReadQuals = read.getBaseQualities(); + + // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, + // and may leave a string of Q2 bases still hanging off the reads. + for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){ + if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD) + numEndClippedBases++; + else + break; + } + + + return numEndClippedBases; + } + + int getOffsetFromClippedReadStart(SAMRecord read, int offset) { + + + return offset - getNumClippedBasesAtStart(read); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index e7b9cfa68..ab7ae4184 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,9 +32,9 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; /*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate; -import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDataManager; -import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDatum; -import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalibrationArgumentCollection; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection; */import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; @@ -57,7 +57,7 @@ import java.util.regex.Pattern; public class PairHMMIndelErrorModel { - private final int BASE_QUAL_THRESHOLD = 10; + public static final int BASE_QUAL_THRESHOLD = 20; private static final int MATCH_OFFSET = 0; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index e919d5c25..ecede068e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -190,7 +190,7 @@ public class VCFUtils { result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, -1, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2")); return result; }