Two bug fixes:
a) Modified the way clipped bases are dealt with in ReadPosRankSumTest when annotating indels. Cigar string cannot be trusted because BWA can clip good high quality bases and some sites get incorrect ReadPos annotations if BWA systematically clips at an indel breakpoint. b) PL header needs to specify "." as length. Otherwise we fail VCF validation if multiallelic sites are present.
This commit is contained in:
parent
643458d7db
commit
5b6d279a2e
|
|
@ -4,9 +4,11 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import net.sf.samtools.*;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
|
@ -67,20 +69,95 @@ public class ReadPosRankSumTest extends RankSumTest {
|
|||
altLikelihood = like;
|
||||
}
|
||||
}
|
||||
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0);
|
||||
final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead());
|
||||
|
||||
int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset());
|
||||
final int numAlignedBases = getNumAlignedBases(p.getRead());
|
||||
|
||||
int rp = readPos;
|
||||
if( readPos > numAlignedBases / 2 ) {
|
||||
readPos = numAlignedBases - ( readPos + 1 );
|
||||
}
|
||||
//if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases);
|
||||
|
||||
if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH))
|
||||
|
||||
// if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads
|
||||
// where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping.
|
||||
// if (readPos < -1)
|
||||
// return;
|
||||
if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) {
|
||||
refQuals.add((double)readPos);
|
||||
else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH))
|
||||
//if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos);
|
||||
}
|
||||
else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) {
|
||||
altQuals.add((double)readPos);
|
||||
//if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int getNumClippedBasesAtStart(SAMRecord read) {
|
||||
// compute total number of clipped bases (soft or hard clipped)
|
||||
// check for hard clips (never consider these bases):
|
||||
final Cigar c = read.getCigar();
|
||||
final CigarElement first = c.getCigarElement(0);
|
||||
|
||||
int numStartClippedBases = 0;
|
||||
if (first.getOperator() == CigarOperator.H) {
|
||||
numStartClippedBases = first.getLength();
|
||||
}
|
||||
byte[] unclippedReadBases = read.getReadBases();
|
||||
byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) {
|
||||
if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
|
||||
numStartClippedBases++;
|
||||
else
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
return numStartClippedBases;
|
||||
}
|
||||
|
||||
int getNumAlignedBases(SAMRecord read) {
|
||||
return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
|
||||
}
|
||||
|
||||
int getNumClippedBasesAtEnd(SAMRecord read) {
|
||||
// compute total number of clipped bases (soft or hard clipped)
|
||||
// check for hard clips (never consider these bases):
|
||||
final Cigar c = read.getCigar();
|
||||
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
|
||||
|
||||
int numEndClippedBases = 0;
|
||||
if (last.getOperator() == CigarOperator.H) {
|
||||
numEndClippedBases = last.getLength();
|
||||
}
|
||||
byte[] unclippedReadBases = read.getReadBases();
|
||||
byte[] unclippedReadQuals = read.getBaseQualities();
|
||||
|
||||
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
|
||||
// and may leave a string of Q2 bases still hanging off the reads.
|
||||
for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){
|
||||
if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
|
||||
numEndClippedBases++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
return numEndClippedBases;
|
||||
}
|
||||
|
||||
int getOffsetFromClippedReadStart(SAMRecord read, int offset) {
|
||||
|
||||
|
||||
return offset - getNumClippedBasesAtStart(read);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,9 +32,9 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
/*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
|
||||
import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDataManager;
|
||||
import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDatum;
|
||||
import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection;
|
||||
*/import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
|
|
@ -57,7 +57,7 @@ import java.util.regex.Pattern;
|
|||
public class PairHMMIndelErrorModel {
|
||||
|
||||
|
||||
private final int BASE_QUAL_THRESHOLD = 10;
|
||||
public static final int BASE_QUAL_THRESHOLD = 20;
|
||||
|
||||
|
||||
private static final int MATCH_OFFSET = 0;
|
||||
|
|
|
|||
|
|
@ -190,7 +190,7 @@ public class VCFUtils {
|
|||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, -1, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2"));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue