Several improvements to the single sample combining steps.
1. updated QualByDepth not to use AD-restricted depth if it is zero. Added a unit test for this change. 2. Fixed small bug in CombineGVCFs where spanning deletions were not being treated consistently throughout. Added a test for this situation. 3. Make sure GenotypeGVCFs puts in the required headers. Updated test files to make sure this is covered. 4. Have GenotypeGVCFs propagate up the MLEAC/AF (which were getting clobbered). Tests updated to account for this.
This commit is contained in:
parent
3cfcfa4fa0
commit
300b474c96
|
|
@ -54,6 +54,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
|
@ -86,7 +87,8 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
return null;
|
||||
|
||||
int depth = 0;
|
||||
int standardDepth = 0;
|
||||
int ADrestrictedDepth = 0;
|
||||
|
||||
for ( final Genotype genotype : genotypes ) {
|
||||
|
||||
|
|
@ -101,39 +103,43 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
// TODO -- annotations must come afterwards (so that QD can use the AD).
|
||||
if ( genotype.hasAD() ) {
|
||||
final int[] AD = genotype.getAD();
|
||||
int variantDepth = 0;
|
||||
for ( int i = 1; i < AD.length; i++ )
|
||||
variantDepth += AD[i];
|
||||
if ( variantDepth <= 1 )
|
||||
continue;
|
||||
final int totalADdepth = (int)MathUtils.sum(AD);
|
||||
if ( totalADdepth - AD[0] > 1 )
|
||||
ADrestrictedDepth += totalADdepth;
|
||||
standardDepth += totalADdepth;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null )
|
||||
continue;
|
||||
depth += context.getBasePileup().depthOfCoverage();
|
||||
standardDepth += context.getBasePileup().depthOfCoverage();
|
||||
|
||||
} else if (perReadAlleleLikelihoodMap != null) {
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName());
|
||||
if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty())
|
||||
continue;
|
||||
|
||||
depth += perReadAlleleLikelihoods.getNumberOfStoredElements();
|
||||
standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements();
|
||||
} else if ( genotype.hasDP() ) {
|
||||
depth += genotype.getDP();
|
||||
standardDepth += genotype.getDP();
|
||||
}
|
||||
}
|
||||
|
||||
if ( depth == 0 )
|
||||
// if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward
|
||||
if ( ADrestrictedDepth > 0 )
|
||||
standardDepth = ADrestrictedDepth;
|
||||
|
||||
if ( standardDepth == 0 )
|
||||
return null;
|
||||
|
||||
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
|
||||
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
|
||||
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
|
||||
double QD = -10.0 * vc.getLog10PError() / ((double)depth * indelNormalizationFactor(altAlleleLength, ref != null));
|
||||
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
|
||||
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
|
||||
double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null));
|
||||
|
||||
// Hack: see note in the fixTooHighQD method below
|
||||
// Hack: see note in the fixTooHighQD method below
|
||||
QD = fixTooHighQD(QD);
|
||||
|
||||
final Map<String, Object> map = new HashMap<>();
|
||||
|
|
@ -149,7 +155,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
* @return a positive double
|
||||
*/
|
||||
private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) {
|
||||
return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
|
||||
return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -205,12 +205,25 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
if ( VCs == null ) throw new IllegalArgumentException("The list of VariantContexts cannot be null");
|
||||
|
||||
for ( final VariantContext vc : VCs ) {
|
||||
if ( vc.getEnd() == pos )
|
||||
if ( isEndingContext(vc, pos) )
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the given variant context end (in terms of reference blocks, not necessarily formally) at the given position.
|
||||
* Note that for the purposes of this method/tool, deletions are considered to be single base events (as opposed to
|
||||
* reference blocks), hence the check for the number of alleles (because we know there will always be a <NON_REF> allele).
|
||||
*
|
||||
* @param vc the variant context
|
||||
* @param pos the position to query against
|
||||
* @return true if this variant context "ends" at this position, false otherwise
|
||||
*/
|
||||
private boolean isEndingContext(final VariantContext vc, final int pos) {
|
||||
return vc.getNAlleles() > 2 || vc.getEnd() == pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Disrupt the VariantContexts so that they all stop at the given pos, write them out, and put the remainder back in the list.
|
||||
*
|
||||
|
|
@ -228,10 +241,8 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
|
|||
|
||||
stoppedVCs.add(vc);
|
||||
|
||||
// if it was ending anyways, then remove it from the future state;
|
||||
// note that for the purposes of this method, deletions are considered to be single base events (as opposed
|
||||
// to ref blocks), hence the check for the number of alleles (because we know there will always be a <NON_REF> allele)
|
||||
if ( vc.getNAlleles() > 2 || vc.getEnd() == pos )
|
||||
// if it was ending anyways, then remove it from the future state
|
||||
if ( isEndingContext(vc, pos) )
|
||||
state.VCs.remove(i);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,10 +56,8 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
|
|||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -154,10 +152,14 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
|||
|
||||
|
||||
public void initialize() {
|
||||
// create the annotation engine
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.<String>emptyList(), this, getToolkit());
|
||||
|
||||
// take care of the VCF headers
|
||||
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
|
||||
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
|
||||
headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
|
||||
headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
|
||||
if ( dbsnp != null && dbsnp.dbsnp.isBound() )
|
||||
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
|
||||
|
||||
|
|
@ -168,9 +170,6 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
|||
// create the genotyping engine
|
||||
genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), new UnifiedArgumentCollection(), logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// create the annotation engine
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.<String>emptyList(), this, getToolkit());
|
||||
|
||||
// collect the actual rod bindings into a list for use later
|
||||
for ( final RodBindingCollection<VariantContext> variantCollection : variantCollections )
|
||||
variants.addAll(variantCollection.getRodBindings());
|
||||
|
|
@ -206,7 +205,13 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
|||
final VariantContext regenotypedVC = genotypingEngine.calculateGenotypes(result);
|
||||
if ( regenotypedVC == null )
|
||||
return null;
|
||||
result = new VariantContextBuilder(regenotypedVC).attributes(originalVC.getAttributes()).make();
|
||||
|
||||
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
|
||||
final Map<String, Object> attrs = new HashMap<>(originalVC.getAttributes());
|
||||
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
|
||||
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
|
||||
|
||||
result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make();
|
||||
}
|
||||
|
||||
// if it turned monomorphic and we don't want such sites, quit
|
||||
|
|
|
|||
|
|
@ -69,9 +69,6 @@ public class QualByDepthUnitTest extends WalkerTest {
|
|||
|
||||
final List<Allele> AA = Arrays.asList(A,A);
|
||||
final List<Allele> AC = Arrays.asList(A,C);
|
||||
final List<Allele> CC = Arrays.asList(C,C);
|
||||
final List<Allele> AG = Arrays.asList(A,G);
|
||||
final List<Allele> CG = Arrays.asList(C,G);
|
||||
final List<Allele> GG = Arrays.asList(G,G);
|
||||
final List<Allele> ACG = Arrays.asList(A,C,G);
|
||||
|
||||
|
|
@ -81,6 +78,7 @@ public class QualByDepthUnitTest extends WalkerTest {
|
|||
final Genotype gGG = new GenotypeBuilder("4", GG).DP(10).AD(new int[]{1,9}).make();
|
||||
|
||||
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC)).make(), 5.0});
|
||||
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gACerror)).make(), 5.0});
|
||||
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAA, gAC)).make(), 5.0});
|
||||
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC, gACerror)).make(), 5.0});
|
||||
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, ACG).log10PError(-5).genotypes(Arrays.asList(gAA, gAC, gACerror, gGG)).make(), 2.5});
|
||||
|
|
|
|||
|
|
@ -139,7 +139,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
|
|||
final File gVCF = executeTest("testOneHasDeletionAndTwoHasRefBlock", spec).first.get(0);
|
||||
final List<VariantContext> allVCs = GATKVCFUtils.readVCF(gVCF).getSecond();
|
||||
|
||||
Assert.assertEquals(allVCs.size(), 2);
|
||||
Assert.assertEquals(allVCs.size(), 3);
|
||||
|
||||
final VariantContext first = allVCs.get(0);
|
||||
Assert.assertEquals(first.getStart(), 69772);
|
||||
|
|
@ -149,14 +149,19 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
|
|||
|
||||
final VariantContext second = allVCs.get(1);
|
||||
Assert.assertEquals(second.getStart(), 69773);
|
||||
Assert.assertEquals(second.getEnd(), 69783);
|
||||
Assert.assertEquals(second.getEnd(), 69774);
|
||||
Assert.assertEquals(second.getGenotypes().size(), 2);
|
||||
|
||||
final VariantContext third = allVCs.get(2);
|
||||
Assert.assertEquals(third.getStart(), 69775);
|
||||
Assert.assertEquals(third.getEnd(), 69783);
|
||||
Assert.assertEquals(third.getGenotypes().size(), 2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMD5s() throws Exception {
|
||||
final String cmd = baseTestString(" -L 1:69485-69791");
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("d2a5ca67a8ef6e27854e7a439883f05d"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("aecdfa9eb32b802cd629e9f811ef15fd"));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testMD5s", spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
|||
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
|
||||
" -L 20:10,000,000-20,000,000", b37KGReference),
|
||||
1,
|
||||
Arrays.asList("54487ea151c49d36a15eac8097a7e460"));
|
||||
Arrays.asList("ebda39d3343b34d21490a78284ed88e8"));
|
||||
executeTest("combineSingleSamplePipelineGVCF", spec);
|
||||
}
|
||||
|
||||
|
|
@ -89,7 +89,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
|||
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
|
||||
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
|
||||
1,
|
||||
Arrays.asList("9e6ef126d5e872e5b2a68c3d73471566"));
|
||||
Arrays.asList("8eeda24a07f66d67b7639a31fda5c903"));
|
||||
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
|
||||
}
|
||||
|
||||
|
|
@ -99,7 +99,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
|||
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
|
||||
" -V " + privateTestDir + "gvcfExample1.vcf",
|
||||
1,
|
||||
Arrays.asList("dd0e2846b3be9692ecb94f152b45c316"));
|
||||
Arrays.asList("2541e164056d5632ad7de784a9af3880"));
|
||||
executeTest("testJustOneSample", spec);
|
||||
}
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
|||
" -V " + privateTestDir + "gvcfExample1.vcf" +
|
||||
" -V " + privateTestDir + "gvcfExample2.vcf",
|
||||
1,
|
||||
Arrays.asList("a4f76a094af4708fc7f96a9b7a1b8726"));
|
||||
Arrays.asList("9daf9602338db9d06c075c6e9a15ee2c"));
|
||||
executeTest("testSamplesWithDifferentLs", spec);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue