Two bug fixes:

a) Modified the way clipped bases are dealt with in ReadPosRankSumTest when annotating indels. Cigar string cannot be trusted because BWA can clip good high quality bases and some sites get incorrect ReadPos annotations if BWA systematically clips at an indel breakpoint.
b) PL header needs to specify "." as length. Otherwise we fail VCF validation if multiallelic sites are present.
This commit is contained in:
Guillermo del Angel 2011-06-29 10:21:27 -04:00
parent 643458d7db
commit 5b6d279a2e
3 changed files with 86 additions and 9 deletions

View File

@ -4,9 +4,11 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import net.sf.samtools.*;
import java.util.Arrays;
import java.util.HashMap;
@ -67,20 +69,95 @@ public class ReadPosRankSumTest extends RankSumTest {
altLikelihood = like;
}
}
int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p.getOffset(), 0, 0);
final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead());
int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset());
final int numAlignedBases = getNumAlignedBases(p.getRead());
int rp = readPos;
if( readPos > numAlignedBases / 2 ) {
readPos = numAlignedBases - ( readPos + 1 );
}
//if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases);
if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH))
// if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads
// where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping.
// if (readPos < -1)
// return;
if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) {
refQuals.add((double)readPos);
else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH))
//if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos);
}
else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) {
altQuals.add((double)readPos);
//if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos);
}
}
}
}
int getNumClippedBasesAtStart(SAMRecord read) {
// compute total number of clipped bases (soft or hard clipped)
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
final CigarElement first = c.getCigarElement(0);
int numStartClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartClippedBases = first.getLength();
}
byte[] unclippedReadBases = read.getReadBases();
byte[] unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
for (int i=numStartClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
numStartClippedBases++;
else
break;
}
return numStartClippedBases;
}
int getNumAlignedBases(SAMRecord read) {
return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
}
int getNumClippedBasesAtEnd(SAMRecord read) {
// compute total number of clipped bases (soft or hard clipped)
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
int numEndClippedBases = 0;
if (last.getOperator() == CigarOperator.H) {
numEndClippedBases = last.getLength();
}
byte[] unclippedReadBases = read.getReadBases();
byte[] unclippedReadQuals = read.getBaseQualities();
// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
for (int i=unclippedReadBases.length-numEndClippedBases-1; i >= 0; i-- ){
if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
numEndClippedBases++;
else
break;
}
return numEndClippedBases;
}
int getOffsetFromClippedReadStart(SAMRecord read, int offset) {
return offset - getNumClippedBasesAtStart(read);
}
}

View File

@ -32,9 +32,9 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
/*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDataManager;
import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalDatum;
import org.broadinstitute.sting.walkers.IndelCountCovariates.RecalibrationArgumentCollection;
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum;
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection;
*/import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
@ -57,7 +57,7 @@ import java.util.regex.Pattern;
public class PairHMMIndelErrorModel {
private final int BASE_QUAL_THRESHOLD = 10;
public static final int BASE_QUAL_THRESHOLD = 20;
private static final int MATCH_OFFSET = 0;

View File

@ -190,7 +190,7 @@ public class VCFUtils {
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)"));
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, -1, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; if site is not biallelic, number of likelihoods if n*(n+1)/2"));
return result;
}