Merge pull request #506 from broadinstitute/eb_quick_fixes_to_single_sample_pipeline

Several improvements to the single sample combining steps.
This commit is contained in:
Eric Banks 2014-02-12 11:30:48 -05:00
commit 6facf695ab
6 changed files with 62 additions and 37 deletions

View File

@ -54,6 +54,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
@ -86,7 +87,8 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
if ( genotypes == null || genotypes.size() == 0 )
return null;
int depth = 0;
int standardDepth = 0;
int ADrestrictedDepth = 0;
for ( final Genotype genotype : genotypes ) {
@ -101,39 +103,43 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
// TODO -- annotations must come afterwards (so that QD can use the AD).
if ( genotype.hasAD() ) {
final int[] AD = genotype.getAD();
int variantDepth = 0;
for ( int i = 1; i < AD.length; i++ )
variantDepth += AD[i];
if ( variantDepth <= 1 )
continue;
final int totalADdepth = (int)MathUtils.sum(AD);
if ( totalADdepth - AD[0] > 1 )
ADrestrictedDepth += totalADdepth;
standardDepth += totalADdepth;
continue;
}
if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) {
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null )
continue;
depth += context.getBasePileup().depthOfCoverage();
standardDepth += context.getBasePileup().depthOfCoverage();
} else if (perReadAlleleLikelihoodMap != null) {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName());
if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty())
continue;
depth += perReadAlleleLikelihoods.getNumberOfStoredElements();
standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements();
} else if ( genotype.hasDP() ) {
depth += genotype.getDP();
standardDepth += genotype.getDP();
}
}
if ( depth == 0 )
// if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward
if ( ADrestrictedDepth > 0 )
standardDepth = ADrestrictedDepth;
if ( standardDepth == 0 )
return null;
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
double QD = -10.0 * vc.getLog10PError() / ((double)depth * indelNormalizationFactor(altAlleleLength, ref != null));
// Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a
// full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null));
// Hack: see note in the fixTooHighQD method below
// Hack: see note in the fixTooHighQD method below
QD = fixTooHighQD(QD);
final Map<String, Object> map = new HashMap<>();
@ -149,7 +155,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
* @return a positive double
*/
private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) {
return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
}
/**

View File

@ -205,12 +205,25 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
if ( VCs == null ) throw new IllegalArgumentException("The list of VariantContexts cannot be null");
for ( final VariantContext vc : VCs ) {
if ( vc.getEnd() == pos )
if ( isEndingContext(vc, pos) )
return true;
}
return false;
}
/**
* Does the given variant context end (in terms of reference blocks, not necessarily formally) at the given position.
* Note that for the purposes of this method/tool, deletions are considered to be single base events (as opposed to
* reference blocks), hence the check for the number of alleles (because we know there will always be a <NON_REF> allele).
*
* @param vc the variant context
* @param pos the position to query against
* @return true if this variant context "ends" at this position, false otherwise
*/
private boolean isEndingContext(final VariantContext vc, final int pos) {
return vc.getNAlleles() > 2 || vc.getEnd() == pos;
}
/**
* Disrupt the VariantContexts so that they all stop at the given pos, write them out, and put the remainder back in the list.
*
@ -228,10 +241,8 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
stoppedVCs.add(vc);
// if it was ending anyways, then remove it from the future state;
// note that for the purposes of this method, deletions are considered to be single base events (as opposed
// to ref blocks), hence the check for the number of alleles (because we know there will always be a <NON_REF> allele)
if ( vc.getNAlleles() > 2 || vc.getEnd() == pos )
// if it was ending anyways, then remove it from the future state
if ( isEndingContext(vc, pos) )
state.VCs.remove(i);
}
}

View File

@ -56,10 +56,8 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -154,10 +152,14 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
public void initialize() {
// create the annotation engine
annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.<String>emptyList(), this, getToolkit());
// take care of the VCF headers
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions());
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
if ( dbsnp != null && dbsnp.dbsnp.isBound() )
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
@ -168,9 +170,6 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
// create the genotyping engine
genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), new UnifiedArgumentCollection(), logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
// create the annotation engine
annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.<String>emptyList(), this, getToolkit());
// collect the actual rod bindings into a list for use later
for ( final RodBindingCollection<VariantContext> variantCollection : variantCollections )
variants.addAll(variantCollection.getRodBindings());
@ -206,7 +205,13 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
final VariantContext regenotypedVC = genotypingEngine.calculateGenotypes(result);
if ( regenotypedVC == null )
return null;
result = new VariantContextBuilder(regenotypedVC).attributes(originalVC.getAttributes()).make();
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
final Map<String, Object> attrs = new HashMap<>(originalVC.getAttributes());
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make();
}
// if it turned monomorphic and we don't want such sites, quit

View File

@ -69,9 +69,6 @@ public class QualByDepthUnitTest extends WalkerTest {
final List<Allele> AA = Arrays.asList(A,A);
final List<Allele> AC = Arrays.asList(A,C);
final List<Allele> CC = Arrays.asList(C,C);
final List<Allele> AG = Arrays.asList(A,G);
final List<Allele> CG = Arrays.asList(C,G);
final List<Allele> GG = Arrays.asList(G,G);
final List<Allele> ACG = Arrays.asList(A,C,G);
@ -81,6 +78,7 @@ public class QualByDepthUnitTest extends WalkerTest {
final Genotype gGG = new GenotypeBuilder("4", GG).DP(10).AD(new int[]{1,9}).make();
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC)).make(), 5.0});
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gACerror)).make(), 5.0});
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAA, gAC)).make(), 5.0});
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, AC).log10PError(-5).genotypes(Arrays.asList(gAC, gACerror)).make(), 5.0});
tests.add(new Object[]{new VariantContextBuilder("test", "20", 10, 10, ACG).log10PError(-5).genotypes(Arrays.asList(gAA, gAC, gACerror, gGG)).make(), 2.5});

View File

@ -139,7 +139,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
final File gVCF = executeTest("testOneHasDeletionAndTwoHasRefBlock", spec).first.get(0);
final List<VariantContext> allVCs = GATKVCFUtils.readVCF(gVCF).getSecond();
Assert.assertEquals(allVCs.size(), 2);
Assert.assertEquals(allVCs.size(), 3);
final VariantContext first = allVCs.get(0);
Assert.assertEquals(first.getStart(), 69772);
@ -149,14 +149,19 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
final VariantContext second = allVCs.get(1);
Assert.assertEquals(second.getStart(), 69773);
Assert.assertEquals(second.getEnd(), 69783);
Assert.assertEquals(second.getEnd(), 69774);
Assert.assertEquals(second.getGenotypes().size(), 2);
final VariantContext third = allVCs.get(2);
Assert.assertEquals(third.getStart(), 69775);
Assert.assertEquals(third.getEnd(), 69783);
Assert.assertEquals(third.getGenotypes().size(), 2);
}
@Test
public void testMD5s() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("d2a5ca67a8ef6e27854e7a439883f05d"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("aecdfa9eb32b802cd629e9f811ef15fd"));
spec.disableShadowBCF();
executeTest("testMD5s", spec);
}

View File

@ -65,7 +65,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("54487ea151c49d36a15eac8097a7e460"));
Arrays.asList("ebda39d3343b34d21490a78284ed88e8"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -89,7 +89,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("9e6ef126d5e872e5b2a68c3d73471566"));
Arrays.asList("8eeda24a07f66d67b7639a31fda5c903"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@ -99,7 +99,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
" -V " + privateTestDir + "gvcfExample1.vcf",
1,
Arrays.asList("dd0e2846b3be9692ecb94f152b45c316"));
Arrays.asList("2541e164056d5632ad7de784a9af3880"));
executeTest("testJustOneSample", spec);
}
@ -110,7 +110,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V " + privateTestDir + "gvcfExample1.vcf" +
" -V " + privateTestDir + "gvcfExample2.vcf",
1,
Arrays.asList("a4f76a094af4708fc7f96a9b7a1b8726"));
Arrays.asList("9daf9602338db9d06c075c6e9a15ee2c"));
executeTest("testSamplesWithDifferentLs", spec);
}
}