Added a fix for combining/genotyping positions over spanning deletions.

Previously, if a SNP occurred in sample A at a position that was in the middle of a deletion for sample B,
sample B would be genotyped as homozygous reference there (but it's NOT reference - there's a deletion).
Now, sample B is genotyped as having a symbolic DEL allele.

Minor cleanup added.  Note that I also removed Laura's previous fix for this problem.

Existing integration tests change because I've added a new header line to the VCF being output.
I also added several tests for the new functionality showing:
1. genotyping from separate and already combined gvcfs give the same output
2. genotyping over multiple spanning deletions works
3. combining works too

Existing unit tests also cover this case.
This commit is contained in:
Eric Banks 2015-04-15 15:12:41 -04:00
parent 6adf91d49f
commit 530e0e5ea6
8 changed files with 288 additions and 92 deletions

View File

@ -88,9 +88,6 @@ import java.util.*;
*/
public class ReferenceConfidenceModel {
//public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now
public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele
private final GenomeLocParser genomeLocParser;
private final SampleList samples;
@ -138,9 +135,7 @@ public class ReferenceConfidenceModel {
*/
public Set<VCFHeaderLine> getVCFHeaderLines() {
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>();
// TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles?
headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
//headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
return headerLines;
}

View File

@ -164,6 +164,8 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
// take care of the VCF headers
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
final Set<String> samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
final VCFHeader vcfHeader = new VCFHeader(headerLines, samples);

View File

@ -211,10 +211,13 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions());
headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
// add the pool values for each genotype
// add headers for annotations added by this tool
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
if ( dbsnp != null && dbsnp.dbsnp.isBound() )
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);

View File

@ -113,20 +113,27 @@ public class ReferenceConfidenceVariantContextMerger {
final Map<String, List<Comparable>> annotationMap = new LinkedHashMap<>();
final GenotypesContext genotypes = GenotypesContext.create();
final int variantContextCount = VCs.size();
// In this list we hold the mapping of each variant context alleles.
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount);
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(VCs.size());
// Keep track of whether we saw a spanning deletion and a non-spanning event
boolean sawSpanningDeletion = false;
boolean sawNonSpanningEvent = false;
// cycle through and add info from the other VCs
for ( final VariantContext vc : VCs ) {
// if this context doesn't start at the current location then it must be a spanning event (deletion or ref block)
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
// record whether it's also a spanning deletion/event (we know this because the VariantContext type is no
// longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele)
sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() );
vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles())
: remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet)));
vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet)));
}
// Add <NON_REF> to the end if at all required in in the output.
// Add <DEL> and <NON_REF> to the end if at all required in in the output.
if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
@ -171,13 +178,27 @@ public class ReferenceConfidenceVariantContextMerger {
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
// note that in order to calculate the end position, we need a list of alleles that doesn't include anything symbolic
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList)
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart())
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(nonSymbolicAlleles(allelesList), loc.getStart(), loc.getStart())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later
return builder.make();
}
/**
* @param list the original alleles list
* @return a non-null list of non-symbolic alleles
*/
private static List<Allele> nonSymbolicAlleles(final List<Allele> list) {
final List<Allele> result = new ArrayList<>(list.size());
for ( final Allele allele : list ) {
if ( !allele.isSymbolic() )
result.add(allele);
}
return result;
}
/**
* Determines the ref allele given the provided reference base at this position
*
@ -246,27 +267,28 @@ public class ReferenceConfidenceVariantContextMerger {
* collects alternative alleles present in variant context and add them to the {@code finalAlleles} set.
* </li></ul>
*
* @param vcAlleles the variant context allele list.
* @param refAllele final reference allele.
* @param vc the variant context.
* @param refAllele final reference allele.
* @param finalAlleles where to add the final set of non-ref called alleles.
* @return never {@code null}
*/
//TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}.
private static List<Allele> remapAlleles(final List<Allele> vcAlleles, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {
final Allele vcRef = vcAlleles.get(0);
if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference");
private static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {
final Allele vcRef = vc.getReference();
final byte[] refBases = refAllele.getBases();
final int extraBaseCount = refBases.length - vcRef.getBases().length;
if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected");
final List<Allele> result = new ArrayList<>(vcAlleles.size());
for (final Allele a : vcAlleles) {
if (a.isReference()) {
result.add(refAllele);
} else if (a.isSymbolic()) {
final List<Allele> result = new ArrayList<>(vc.getNAlleles());
result.add(refAllele);
for (final Allele a : vc.getAlternateAlleles()) {
if (a.isSymbolic()) {
result.add(a);
// we always skip <NON_REF> when adding to finalAlleles this is done outside if applies.
if (!a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))
// we always skip <NON_REF> when adding to finalAlleles; this is done outside if applies.
// we also skip <*DEL> if there isn't a real alternate allele.
if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() )
finalAlleles.add(a);
} else if (a.isCalled()) {
final Allele newAllele;
@ -287,17 +309,31 @@ public class ReferenceConfidenceVariantContextMerger {
}
/**
* Replaces any alleles in the list with NO CALLS, except for the generic ALT allele
* Replaces any alleles in the VariantContext with NO CALLS or the symbolic deletion allele as appropriate, except for the generic ALT allele
*
* @param alleles list of alleles to replace
* @param vc VariantContext with the alleles to replace
* @return non-null list of alleles
*/
private static List<Allele> replaceWithNoCalls(final List<Allele> alleles) {
if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null");
private static List<Allele> replaceWithNoCallsAndDels(final VariantContext vc) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
final List<Allele> result = new ArrayList<>(alleles.size());
for ( final Allele allele : alleles )
result.add(allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL);
final List<Allele> result = new ArrayList<>(vc.getNAlleles());
// no-call the reference allele
result.add(Allele.NO_CALL);
// handle the alternate alleles
for ( final Allele allele : vc.getAlternateAlleles() ) {
final Allele replacement;
if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) )
replacement = allele;
else if ( allele.length() < vc.getReference().length() )
replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
else
replacement = Allele.NO_CALL;
result.add(replacement);
}
return result;
}
@ -387,43 +423,16 @@ public class ReferenceConfidenceVariantContextMerger {
throw new UserException("The list of input alleles must contain " + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
final int indexOfNonRef = remappedAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
//if the refs don't match then let the non-ref allele be the most likely of the alts
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
final int indexOfBestAlt;
if (!refsMatch && g.hasPL()) {
final int[] PLs = g.getPL().clone();
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
indexOfBestAlt = pair.alleleIndex2;
}
else
indexOfBestAlt = indexOfNonRef;
final int[] indexMapping = new int[targetAlleles.size()];
// the reference likelihoods should always map to each other (even if the alleles don't)
indexMapping[0] = 0;
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
final int targetNonRef = targetAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final boolean targetHasNonRef = targetNonRef != -1;
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
// create the index mapping, using the <NON-REF> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
if (targetHasNonRef) {
if (refsMatch) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
else {
indexMapping[targetNonRef] = indexOfBestAlt;
}
}
return indexMapping;
}

View File

@ -100,7 +100,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("7b3153135e4f8e1d137d3f4beb46f182"));
Arrays.asList("ebe26077809961f53d5244643d24fd45"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -112,7 +112,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("4f546634213ece6f08ec9258620b92bb"));
Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -190,7 +190,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testMD5s() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b7c753452ab0c05f9cee538e420b87fa"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009"));
spec.disableShadowBCF();
executeTest("testMD5s", spec);
}
@ -198,7 +198,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionOutput() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("bb6420ead95da4c72e76ca4bf5860ef0"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionOutput", spec);
}
@ -206,16 +206,27 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBreakBlocks() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("dd31182124c4b78a8a03edb1e0cf618b"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963"));
spec.disableShadowBCF();
executeTest("testBreakBlocks", spec);
}
@Test
public void testSpanningDeletions() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f"));
spec.disableShadowBCF();
executeTest("testSpanningDeletions", spec);
}
@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"
+ " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("75d81c247bef5e394b4a2d4126aee2b3"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56"));
spec.disableShadowBCF();
executeTest("testWrongReferenceBaseBugFix",spec);
@ -224,7 +235,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionInput() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("994ff5c219c683635eadc1624cbbda74"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionInput", spec);
}

View File

@ -82,7 +82,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("23ff3e22262929138ca1f00fc111cadf"));
Arrays.asList("4dfea9a9b1a77c4c6b9edc61f9ea8da2"));
executeTest("testUpdatePGT", spec);
}
@ -91,7 +91,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference),
1,
Arrays.asList("88fa4a021e4aac9a0e48bd54b2949ece"));
Arrays.asList("a96b79e7c3689c8d5506083cb6d27390"));
executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec);
}
@ -103,7 +103,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("06b4e2589c5b903f7c51ae9968bebe77"));
Arrays.asList("bf3c1982ab6ffee410cb6a1fff6e7105"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -115,7 +115,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("599394c205c1d6641b9bebabbd29e13c"));
Arrays.asList("47d454936dc1f17cf4c4f84f02841346"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -127,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("f7d5344a85e6d7fc2437d4253b424cb0"));
Arrays.asList("5d79ea9de8ada8520d01284cf0c9f720"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -139,7 +139,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("c9e4d1e52ee1f3a5233f1fb100f24d5e"));
Arrays.asList("d69b43cac448f45218e77308fc01e9e6"));
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}
@ -152,7 +152,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("aa19980b9a525afed43e98c821114ae5"));
Arrays.asList("7c93d82758bfb6e7efec257ef8a46217"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@ -164,7 +164,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("f23c9d62542a69b5cbf0e9f89fdd235d"));
Arrays.asList("5b60a7a9575ea83407aa61123960a0cc"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@ -174,7 +174,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
" -V " + privateTestDir + "gvcfExample1.vcf",
1,
Arrays.asList("d602d9e5d336798e4ccb52d2b5f91677"));
Arrays.asList("9e59b94c84dd673b8db9d35cae7e0f68"));
executeTest("testJustOneSample", spec);
}
@ -185,14 +185,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V " + privateTestDir + "gvcfExample1.vcf" +
" -V " + privateTestDir + "gvcfExample2.vcf",
1,
Arrays.asList("6c6d6ef90386eb6c6ed649379aac0c13"));
Arrays.asList("8407cb9a1ab34e705e5a54a0d4146d84"));
executeTest("testSamplesWithDifferentLs", spec);
}
@Test(enabled = true)
public void testNoPLsException() {
// Test with input files with (1) 0/0 and (2) ./.
final String md5 = "d04b32cf2fa97d303ff7fdc779a653d4";
final String md5 = "3e69805dc1c0ada0a050a65b89ecab30";
WalkerTestSpec spec1 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference +
" --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf",
@ -212,7 +212,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-nda"),
1,
Arrays.asList("7132a43d93a9855d03b27b4b0381194c"));
Arrays.asList("5a036de16b7a87626d2b76727376d9df"));
executeTest("testNDA", spec);
}
@ -221,7 +221,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-maxAltAlleles 1"),
1,
Arrays.asList("07844593a4e1ff1110ef8c1de42cc290"));
Arrays.asList("2f3e6879fa27128a8be7b067ded78966"));
executeTest("testMaxAltAlleles", spec);
}
@ -230,7 +230,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"),
1,
Arrays.asList("56caad762b26479ba5e2cc99222b9030"));
Arrays.asList("2e4a1ad71e8fc127b594077166c0344b"));
executeTest("testStandardConf", spec);
}
@ -274,7 +274,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" --uniquifySamples", b37KGReference),
1,
Arrays.asList("ba36b36145e038e3cb004adf11bce96c"));
Arrays.asList("9a472c4e101fff4892efb9255c5cd8b3"));
executeTest("testUniquifiedSamples", spec);
}
@ -311,4 +311,173 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
return attributeValues;
}
/**
* Section to test spanning deletions
*/
@Test
public void testSpanningDeletions() throws IOException {
final String gvcf1 = privateTestDir + "spanningDel.1.g.vcf";
final String gvcf2 = privateTestDir + "spanningDel.2.g.vcf";
final String gvcf3 = privateTestDir + "spanningDel.3.g.vcf";
// create the genotyped VCF to use as a basis for comparison against all of the combined versions
// case 0: GenotypeGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf)
final WalkerTestSpec genotypeBase = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3,
1,
Arrays.asList(""));
genotypeBase.disableShadowBCF();
final File genotypeBaseVCF = executeTest("genotypeBase", genotypeBase).getFirst().get(0);
final List<VariantContext> BASE_VARIANT_CONTEXTS = getVariantContexts(genotypeBaseVCF);
// case 1: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf)
final WalkerTestSpec combine12 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2,
1,
Arrays.asList(""));
combine12.disableShadowBCF();
final File combined_gVCF12 = executeTest("combine12", combine12).getFirst().get(0);
final WalkerTestSpec genotype12_3 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12.getAbsolutePath() + " -V " + gvcf3,
1,
Arrays.asList(""));
genotype12_3.disableShadowBCF();
final File genotype12_3VCF = executeTest("genotype12_3", genotype12_3).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_12_3 = getVariantContexts(genotype12_3VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12_3);
// case 2: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf))
final WalkerTestSpec combine12then3 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12 + " -V " + gvcf3,
1,
Arrays.asList(""));
combine12then3.disableShadowBCF();
final File combined_gVCF12then3 = executeTest("combined_gVCF12then3", combine12then3).getFirst().get(0);
final WalkerTestSpec genotype12then3 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12then3.getAbsolutePath(),
1,
Arrays.asList(""));
genotype12then3.disableShadowBCF();
final File genotype12then3VCF = executeTest("genotype12then3", genotype12then3).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_12then3 = getVariantContexts(genotype12then3VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12then3);
// case 3: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 3.g.vcf), 2.g.vcf))
final WalkerTestSpec combine13 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf3,
1,
Arrays.asList(""));
combine13.disableShadowBCF();
final File combined_gVCF13 = executeTest("combine13", combine13).getFirst().get(0);
final WalkerTestSpec combine13then2 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF13 + " -V " + gvcf2,
1,
Arrays.asList(""));
combine13then2.disableShadowBCF();
final File combined_gVCF13then2 = executeTest("combined_gVCF13then2", combine13then2).getFirst().get(0);
final WalkerTestSpec genotype13then2 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF13then2.getAbsolutePath(),
1,
Arrays.asList(""));
genotype13then2.disableShadowBCF();
final File genotype13then2VCF = executeTest("genotype13then2", genotype13then2).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_13then2 = getVariantContexts(genotype13then2VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_13then2);
// case 4: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf))
final WalkerTestSpec combine123 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3,
1,
Arrays.asList(""));
combine123.disableShadowBCF();
final File combined_gVCF123 = executeTest("combine123", combine123).getFirst().get(0);
final WalkerTestSpec genotype123 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF123.getAbsolutePath(),
1,
Arrays.asList(""));
genotype123.disableShadowBCF();
final File genotype123VCF = executeTest("genotype123", genotype123).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_123 = getVariantContexts(genotype123VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_123);
}
/**
* Returns a list of VariantContext records from a VCF file
*
* @param vcfFile VCF file
*
* @throws IOException if the file does not exist or can not be opened
*
* @return list of VariantContext records
*/
private static List<VariantContext> getVariantContexts(final File vcfFile) throws IOException {
final VCFCodec codec = new VCFCodec();
final FileInputStream s = new FileInputStream(vcfFile);
final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s));
codec.readHeader(lineIteratorVCF);
final List<VariantContext> VCs = new ArrayList<>();
while ( lineIteratorVCF.hasNext() ) {
final String line = lineIteratorVCF.next();
Assert.assertFalse(line == null);
VCs.add(codec.decode(line));
}
return VCs;
}
private static void testVCsAreEqual(final List<VariantContext> VCs1, final List<VariantContext> VCs2) {
Assert.assertEquals(VCs1.size(), VCs2.size(), "number of Variant Contexts");
for ( int i = 0; i < VCs1.size(); i++ ) {
final VariantContext vc1 = VCs1.get(i);
final VariantContext vc2 = VCs2.get(i);
Assert.assertEquals(vc1.toStringDecodeGenotypes(), vc2.toStringDecodeGenotypes());
}
}
private static final String simpleSpanningDeletionsMD5 = "e8616a396d40b4918ad30189856ceb01";
@Test(enabled = true)
public void testSpanningDeletionsMD5() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList(simpleSpanningDeletionsMD5));
spec.disableShadowBCF();
executeTest("testSpanningDeletionsMD5", spec);
}
@Test(enabled = true)
public void testSpanningDeletionsFromCombinedGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.combined.g.vcf",
1,
Arrays.asList(simpleSpanningDeletionsMD5));
spec.disableShadowBCF();
executeTest("testSpanningDeletionsFromCombinedGVCFMD5", spec);
}
@Test(enabled = true)
public void testMultipleSpanningDeletionsMD5() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf",
1,
Arrays.asList("1c418229117bc8f148a69eda9c496309"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsMD5", spec);
}
}

View File

@ -78,6 +78,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
Allele ATref;
Allele Anoref;
Allele GT;
Allele del;
private GenomeLocParser genomeLocParser;
@ -94,6 +95,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
ATCATCT = Allele.create("ATCATCT");
ATref = Allele.create("AT",true);
Anoref = Allele.create("A",false);
del = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
GT = Allele.create("GT",false);
genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference)));
}
@ -217,6 +219,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
final List<Allele> A_C_del = Arrays.asList(Aref, C, del);
// first test the case of a single record
tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT),
@ -249,18 +252,19 @@ public class VariantContextMergerUnitTest extends BaseTest {
// a SNP with a spanning deletion
tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()});
new VariantContextBuilder(VCbase).alleles(A_C_del).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 20, 72, 10}).alleles(noCalls).make()).make()});
// combination of all
tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
loc, false, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()});
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC, del)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73, 71, 72, 73, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73, 71, 73, 72, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10, 71, 73, 73, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74, 71, 72, 73, 74, 74}).alleles(noCalls).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000, 100, 1000, 1000, 1000, 1000}).alleles(noCalls).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800, 80, 800, 800, 800, 800}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73, 20, 72, 72, 72, 10}).alleles(noCalls).make()).make()});
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT),

View File

@ -123,7 +123,10 @@ public final class GATKVCFConstants {
public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
//Alleles
// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL";
public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site
}