Merge pull request #812 from broadinstitute/ldg_combineData_submit

New walker to combine WGS and WES data
This commit is contained in:
Valentin Ruano Rubio 2015-03-02 15:12:31 -05:00
commit f8f2680142
6 changed files with 127 additions and 27 deletions

View File

@ -92,6 +92,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
private Set<String> founderIds;
private int sampleCount;
private boolean pedigreeCheckWarningLogged = false;
private boolean didUniquifiedSampleNameCheck = false;
@Override
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
@ -104,6 +105,11 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
if(founderIds == null && walker != null) {
founderIds = ((Walker) walker).getSampleDB().getFounderIds();
}
//if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders
if (!didUniquifiedSampleNameCheck) {
checkSampleNames(vc);
didUniquifiedSampleNameCheck = true;
}
if ( founderIds == null || founderIds.isEmpty() ) {
if ( !pedigreeCheckWarningLogged ) {
logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line.");
@ -181,6 +187,22 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F));
}
//this method is intended to reconcile uniquified sample names
// it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds
// is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different)
// sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated.
protected void checkSampleNames(final VariantContext vc) {
Set<String> vcSamples = new HashSet<>();
vcSamples.addAll(vc.getSampleNames());
if (!vcSamples.isEmpty()) {
if (founderIds!=null) {
vcSamples.removeAll(founderIds);
if (vcSamples.equals(vc.getSampleNames()))
founderIds = vc.getSampleNames();
}
}
}
@Override
public List<String> getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); }

View File

@ -293,7 +293,7 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
// we need the specialized merge if the site contains anything other than ref blocks
final VariantContext mergedVC;
if ( containsTrueAltAllele(stoppedVCs) )
mergedVC = ReferenceConfidenceVariantContextMerger.merge(stoppedVCs, gLoc, refBase, false);
mergedVC = ReferenceConfidenceVariantContextMerger.merge(stoppedVCs, gLoc, refBase, false, false);
else
mergedVC = referenceBlockMerge(stoppedVCs, state, pos.getStart());

View File

@ -60,6 +60,7 @@ import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.genotyper.SampleListUtils;
@ -137,7 +138,15 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
@Argument(fullName="includeNonVariantSites", shortName="allSites", doc="Include loci found to be non-variant after genotyping", required=false)
public boolean INCLUDE_NON_VARIANTS = false;
@ArgumentCollection
/**
* Uniquify all sample names (intended for use with multiple inputs for the same sample)
*/
@Hidden
@Advanced
@Argument(fullName="uniquifySamples", shortName="uniquifySamples", doc="Assume duplicate samples are present and uniquify all names with '.variant' and file number index")
public boolean uniquifySamples = false;
@ArgumentCollection
public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();
/**
@ -166,13 +175,33 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
public void initialize() {
boolean inputsAreTagged = false;
// collect the actual rod bindings into a list for use later
for ( final RodBindingCollection<VariantContext> variantCollection : variantCollections )
for ( final RodBindingCollection<VariantContext> variantCollection : variantCollections ) {
variants.addAll(variantCollection.getRodBindings());
if (uniquifySamples) {
for (final RodBinding<VariantContext> rb : variantCollection.getRodBindings()) {
//are inputs passed in with -V:fileTag ?
if (!rb.getTags().isEmpty()) inputsAreTagged = true;
}
}
}
//RodBinding tags are used in sample uniquification
if (inputsAreTagged)
logger.warn("Output uniquified VCF may not be suitable for input to CombineSampleData because input VCF(s) contain tags.");
final GenomeAnalysisEngine toolkit = getToolkit();
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants);
final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
final GATKVariantContextUtils.GenotypeMergeType mergeType;
if(uniquifySamples) {
mergeType = GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY;
}
else
mergeType = GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE;
final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, mergeType));
// create the genotyping engine
genotypingEngine = new UnifiedGenotypingEngine(createUAC(), samples, toolkit.getGenomeLocParser(), GeneralPloidyFailOverAFCalculatorProvider.createThreadSafeProvider(toolkit, genotypeArgs, logger),
toolkit.getArguments().BAQMode);
@ -201,7 +230,7 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
return null;
final GenomeLoc loc = ref.getLocus();
final VariantContext combinedVC = ReferenceConfidenceVariantContextMerger.merge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null, true);
final VariantContext combinedVC = ReferenceConfidenceVariantContextMerger.merge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null, true, uniquifySamples);
if ( combinedVC == null )
return null;
return regenotypeVC(tracker, ref, combinedVC);
@ -336,6 +365,10 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
return recoveredGs;
}
private void checkRODtags() {
}
/**
* Creates a UnifiedArgumentCollection with appropriate values filled in from the arguments in this walker
* @return a complete UnifiedArgumentCollection

View File

@ -83,9 +83,11 @@ public class ReferenceConfidenceVariantContextMerger {
* @param loc the current location
* @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case
* @param removeNonRefSymbolicAllele if true, remove the <NON_REF> allele from the merged VC
* @param samplesAreUniquified if true, sample names have been uniquified
* @return new VariantContext representing the merge of all VCs or null if it not relevant
*/
public static VariantContext merge(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) {
public static VariantContext merge(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele,
final boolean samplesAreUniquified) {
// this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs
if ( VCs == null || VCs.size() == 0 )
return null;
@ -133,7 +135,7 @@ public class ReferenceConfidenceVariantContextMerger {
final VariantContext vc = pair.getFirst();
final List<Allele> remappedAlleles = pair.getSecond();
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList);
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) {
@ -303,15 +305,17 @@ public class ReferenceConfidenceVariantContextMerger {
* Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles.
* This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set.
*
* @param mergedGenotypes the genotypes context to add to
* @param VC the Variant Context for the sample
* @param remappedAlleles the list of remapped alleles for the sample
* @param targetAlleles the list of target alleles
* @param mergedGenotypes the genotypes context to add to
* @param VC the Variant Context for the sample
* @param remappedAlleles the list of remapped alleles for the sample
* @param targetAlleles the list of target alleles
* @param samplesAreUniquified true if sample names have been uniquified
*/
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
final VariantContext VC,
final List<Allele> remappedAlleles,
final List<Allele> targetAlleles) {
final List<Allele> targetAlleles,
final boolean samplesAreUniquified) {
final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY);
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
@ -320,11 +324,14 @@ public class ReferenceConfidenceVariantContextMerger {
int[] perSampleIndexesOfRelevantAlleles;
for ( final Genotype g : VC.getGenotypes() ) {
final String name = g.getSampleName();
if ( mergedGenotypes.containsSample(name) )
continue;
final String name;
if (samplesAreUniquified)
name = g.getSampleName() + "." + VC.getSource();
else
name = g.getSampleName();
final int ploidy = g.getPloidy();
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
genotypeBuilder.name(name);
if (g.hasPL()) {
// lazy initialization of the genotype index map by ploidy.
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g);

View File

@ -261,6 +261,23 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
Assert.assertTrue(gVCFList.containsAll(outputVCFList));
}
@Test
public void testUniquifiedSamples() throws IOException {
//two copies of 5 samples; will also test InbreedingCoeff calculation for uniquified samples
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample1B " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample2B " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:combined1 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" --uniquifySamples", b37KGReference),
1,
Arrays.asList("ef6a96d57434bb63935fa7d9f012da9f"));
executeTest("testUniquifiedSamples", spec);
}
/**
* Returns a list of attribute values from a VCF file
*

View File

@ -99,8 +99,9 @@ public class VariantContextMergerUnitTest extends BaseTest {
}
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final String testID, final List<VariantContext> toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) {
final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true);
public void testReferenceConfidenceMerge(final String testID, final List<VariantContext> toMerge, final GenomeLoc loc,
final boolean returnSiteEvenIfMonomorphic, final boolean uniquifySamples, final VariantContext expectedResult) {
final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true, uniquifySamples);
if ( result == null ) {
Assert.assertTrue(expectedResult == null);
return;
@ -171,6 +172,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
final int start = 10;
final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start);
final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCbase2 = new VariantContextBuilder("test2", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make();
final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73};
@ -183,26 +185,34 @@ public class VariantContextMergerUnitTest extends BaseTest {
final List<Allele> A_ALT = Arrays.asList(Aref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make();
final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();
final Allele AAref = Allele.create("AA", true);
final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make();
final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();
final List<Allele> A_C = Arrays.asList(Aref, C);
final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make();
final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_C = new VariantContextBuilder(VCbase2).alleles(A_C_ALT).genotypes(gA_C).make();
final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();
final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();
final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make();
final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make();
final VariantContext vcA_C_G = new VariantContextBuilder(VCbase2).alleles(A_C_G_ALT).genotypes(gA_C_G).make();
final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();
final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();
final Allele A = Allele.create("A", false);
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
@ -210,40 +220,40 @@ public class VariantContextMergerUnitTest extends BaseTest {
// first test the case of a single record
tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()});
// now, test pairs:
// a SNP with another SNP
tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()});
// a SNP with an indel
tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()});
// a SNP with 2 SNPs
tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()});
// a SNP with a ref record
tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()});
// spanning records:
// a SNP with a spanning ref record
tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()});
// a SNP with a spanning deletion
tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()});
// combination of all
tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
loc, false,
loc, false, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(),
@ -255,12 +265,23 @@ public class VariantContextMergerUnitTest extends BaseTest {
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT),
loc, false,
loc, false, false,
null});
tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT),
loc, true,
loc, true, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()});
// test uniquification of sample names
tests.add(new Object[]{"test10",Arrays.asList(vcA_C, vcA_C_ALT), loc, false, true,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(
new GenotypeBuilder("A_C.test2").PL(new int[]{30, 20, 10}).alleles(noCalls).make(),
new GenotypeBuilder("A_C.test").PL(new int[]{30, 20, 10}).alleles(noCalls).make()).make()});
tests.add(new Object[]{"test11",Arrays.asList(vcA_C_G, vcA_C_G_ALT), loc, false, true,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(
new GenotypeBuilder("A_C_G.test2").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(),
new GenotypeBuilder("A_C_G.test").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make()).make()});
final Object[][] result = tests.toArray(new Object[][]{});
return result;
}