Merge pull request #938 from broadinstitute/eb_fix_spanning_deletions_in_genotyping

Added a fix for genotyping positions over spanning deletions.
This commit is contained in:
Eric Banks 2015-05-11 23:11:47 -04:00
commit 53a34cea4a
8 changed files with 288 additions and 92 deletions

View File

@ -88,9 +88,6 @@ import java.util.*;
*/
public class ReferenceConfidenceModel {
//public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now
public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele
private final GenomeLocParser genomeLocParser;
private final SampleList samples;
@ -138,9 +135,7 @@ public class ReferenceConfidenceModel {
*/
public Set<VCFHeaderLine> getVCFHeaderLines() {
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>();
// TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles?
headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
//headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
return headerLines;
}

View File

@ -165,6 +165,8 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
// take care of the VCF headers
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
final Set<String> samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
final VCFHeader vcfHeader = new VCFHeader(headerLines, samples);

View File

@ -217,10 +217,13 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions());
headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
// add the pool values for each genotype
// add headers for annotations added by this tool
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
if ( dbsnp != null && dbsnp.dbsnp.isBound() )
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);

View File

@ -113,20 +113,27 @@ public class ReferenceConfidenceVariantContextMerger {
final Map<String, List<Comparable>> annotationMap = new LinkedHashMap<>();
final GenotypesContext genotypes = GenotypesContext.create();
final int variantContextCount = VCs.size();
// In this list we hold the mapping of each variant context alleles.
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount);
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(VCs.size());
// Keep track of whether we saw a spanning deletion and a non-spanning event
boolean sawSpanningDeletion = false;
boolean sawNonSpanningEvent = false;
// cycle through and add info from the other VCs
for ( final VariantContext vc : VCs ) {
// if this context doesn't start at the current location then it must be a spanning event (deletion or ref block)
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
// record whether it's also a spanning deletion/event (we know this because the VariantContext type is no
// longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele)
sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() );
vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles())
: remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet)));
vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet)));
}
// Add <NON_REF> to the end if at all required in in the output.
// Add <DEL> and <NON_REF> to the end if at all required in in the output.
if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
@ -171,13 +178,27 @@ public class ReferenceConfidenceVariantContextMerger {
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
// note that in order to calculate the end position, we need a list of alleles that doesn't include anything symbolic
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList)
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart())
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(nonSymbolicAlleles(allelesList), loc.getStart(), loc.getStart())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later
return builder.make();
}
/**
* @param list the original alleles list
* @return a non-null list of non-symbolic alleles
*/
private static List<Allele> nonSymbolicAlleles(final List<Allele> list) {
final List<Allele> result = new ArrayList<>(list.size());
for ( final Allele allele : list ) {
if ( !allele.isSymbolic() )
result.add(allele);
}
return result;
}
/**
* Determines the ref allele given the provided reference base at this position
*
@ -246,27 +267,28 @@ public class ReferenceConfidenceVariantContextMerger {
* collects alternative alleles present in variant context and add them to the {@code finalAlleles} set.
* </li></ul>
*
* @param vcAlleles the variant context allele list.
* @param refAllele final reference allele.
* @param vc the variant context.
* @param refAllele final reference allele.
* @param finalAlleles where to add the final set of non-ref called alleles.
* @return never {@code null}
*/
//TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}.
private static List<Allele> remapAlleles(final List<Allele> vcAlleles, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {
final Allele vcRef = vcAlleles.get(0);
if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference");
private static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {
final Allele vcRef = vc.getReference();
final byte[] refBases = refAllele.getBases();
final int extraBaseCount = refBases.length - vcRef.getBases().length;
if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected");
final List<Allele> result = new ArrayList<>(vcAlleles.size());
for (final Allele a : vcAlleles) {
if (a.isReference()) {
result.add(refAllele);
} else if (a.isSymbolic()) {
final List<Allele> result = new ArrayList<>(vc.getNAlleles());
result.add(refAllele);
for (final Allele a : vc.getAlternateAlleles()) {
if (a.isSymbolic()) {
result.add(a);
// we always skip <NON_REF> when adding to finalAlleles this is done outside if applies.
if (!a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))
// we always skip <NON_REF> when adding to finalAlleles; this is done outside if applies.
// we also skip <*DEL> if there isn't a real alternate allele.
if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() )
finalAlleles.add(a);
} else if (a.isCalled()) {
final Allele newAllele;
@ -287,17 +309,31 @@ public class ReferenceConfidenceVariantContextMerger {
}
/**
* Replaces any alleles in the list with NO CALLS, except for the generic ALT allele
* Replaces any alleles in the VariantContext with NO CALLS or the symbolic deletion allele as appropriate, except for the generic ALT allele
*
* @param alleles list of alleles to replace
* @param vc VariantContext with the alleles to replace
* @return non-null list of alleles
*/
private static List<Allele> replaceWithNoCalls(final List<Allele> alleles) {
if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null");
private static List<Allele> replaceWithNoCallsAndDels(final VariantContext vc) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
final List<Allele> result = new ArrayList<>(alleles.size());
for ( final Allele allele : alleles )
result.add(allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL);
final List<Allele> result = new ArrayList<>(vc.getNAlleles());
// no-call the reference allele
result.add(Allele.NO_CALL);
// handle the alternate alleles
for ( final Allele allele : vc.getAlternateAlleles() ) {
final Allele replacement;
if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) )
replacement = allele;
else if ( allele.length() < vc.getReference().length() )
replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
else
replacement = Allele.NO_CALL;
result.add(replacement);
}
return result;
}
@ -387,43 +423,16 @@ public class ReferenceConfidenceVariantContextMerger {
throw new UserException("The list of input alleles must contain " + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records");
final int indexOfNonRef = remappedAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
//if the refs don't match then let the non-ref allele be the most likely of the alts
//TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref
final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false);
final int indexOfBestAlt;
if (!refsMatch && g.hasPL()) {
final int[] PLs = g.getPL().clone();
PLs[0] = Integer.MAX_VALUE; //don't pick 0/0
final int indexOfBestAltPL = MathUtils.minElementIndex(PLs);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL);
indexOfBestAlt = pair.alleleIndex2;
}
else
indexOfBestAlt = indexOfNonRef;
final int[] indexMapping = new int[targetAlleles.size()];
// the reference likelihoods should always map to each other (even if the alleles don't)
indexMapping[0] = 0;
// create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
final int targetNonRef = targetAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final boolean targetHasNonRef = targetNonRef != -1;
final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1;
for ( int i = 1; i <= lastConcreteAlt; i++ ) {
// create the index mapping, using the <NON-REF> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
if (targetHasNonRef) {
if (refsMatch) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef));
indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
else {
indexMapping[targetNonRef] = indexOfBestAlt;
}
}
return indexMapping;
}

View File

@ -100,7 +100,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("7b3153135e4f8e1d137d3f4beb46f182"));
Arrays.asList("ebe26077809961f53d5244643d24fd45"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -112,7 +112,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("4f546634213ece6f08ec9258620b92bb"));
Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -190,7 +190,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testMD5s() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b7c753452ab0c05f9cee538e420b87fa"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009"));
spec.disableShadowBCF();
executeTest("testMD5s", spec);
}
@ -198,7 +198,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionOutput() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("bb6420ead95da4c72e76ca4bf5860ef0"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionOutput", spec);
}
@ -206,16 +206,27 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBreakBlocks() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("dd31182124c4b78a8a03edb1e0cf618b"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963"));
spec.disableShadowBCF();
executeTest("testBreakBlocks", spec);
}
@Test
public void testSpanningDeletions() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f"));
spec.disableShadowBCF();
executeTest("testSpanningDeletions", spec);
}
@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"
+ " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("75d81c247bef5e394b4a2d4126aee2b3"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56"));
spec.disableShadowBCF();
executeTest("testWrongReferenceBaseBugFix",spec);
@ -224,7 +235,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionInput() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("994ff5c219c683635eadc1624cbbda74"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionInput", spec);
}

View File

@ -82,7 +82,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("23ff3e22262929138ca1f00fc111cadf"));
Arrays.asList("4dfea9a9b1a77c4c6b9edc61f9ea8da2"));
executeTest("testUpdatePGT", spec);
}
@ -91,7 +91,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference),
1,
Arrays.asList("88fa4a021e4aac9a0e48bd54b2949ece"));
Arrays.asList("a96b79e7c3689c8d5506083cb6d27390"));
executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec);
}
@ -103,7 +103,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("06b4e2589c5b903f7c51ae9968bebe77"));
Arrays.asList("bf3c1982ab6ffee410cb6a1fff6e7105"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -115,7 +115,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("599394c205c1d6641b9bebabbd29e13c"));
Arrays.asList("47d454936dc1f17cf4c4f84f02841346"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -127,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("f7d5344a85e6d7fc2437d4253b424cb0"));
Arrays.asList("5d79ea9de8ada8520d01284cf0c9f720"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -139,7 +139,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("c9e4d1e52ee1f3a5233f1fb100f24d5e"));
Arrays.asList("d69b43cac448f45218e77308fc01e9e6"));
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}
@ -152,7 +152,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("aa19980b9a525afed43e98c821114ae5"));
Arrays.asList("7c93d82758bfb6e7efec257ef8a46217"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@ -164,7 +164,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("f23c9d62542a69b5cbf0e9f89fdd235d"));
Arrays.asList("5b60a7a9575ea83407aa61123960a0cc"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@ -174,7 +174,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
" -V " + privateTestDir + "gvcfExample1.vcf",
1,
Arrays.asList("d602d9e5d336798e4ccb52d2b5f91677"));
Arrays.asList("9e59b94c84dd673b8db9d35cae7e0f68"));
executeTest("testJustOneSample", spec);
}
@ -185,14 +185,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V " + privateTestDir + "gvcfExample1.vcf" +
" -V " + privateTestDir + "gvcfExample2.vcf",
1,
Arrays.asList("6c6d6ef90386eb6c6ed649379aac0c13"));
Arrays.asList("8407cb9a1ab34e705e5a54a0d4146d84"));
executeTest("testSamplesWithDifferentLs", spec);
}
@Test(enabled = true)
public void testNoPLsException() {
// Test with input files with (1) 0/0 and (2) ./.
final String md5 = "d04b32cf2fa97d303ff7fdc779a653d4";
final String md5 = "3e69805dc1c0ada0a050a65b89ecab30";
WalkerTestSpec spec1 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference +
" --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf",
@ -212,7 +212,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-nda"),
1,
Arrays.asList("7132a43d93a9855d03b27b4b0381194c"));
Arrays.asList("5a036de16b7a87626d2b76727376d9df"));
executeTest("testNDA", spec);
}
@ -221,7 +221,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-maxAltAlleles 1"),
1,
Arrays.asList("07844593a4e1ff1110ef8c1de42cc290"));
Arrays.asList("2f3e6879fa27128a8be7b067ded78966"));
executeTest("testMaxAltAlleles", spec);
}
@ -230,7 +230,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"),
1,
Arrays.asList("56caad762b26479ba5e2cc99222b9030"));
Arrays.asList("2e4a1ad71e8fc127b594077166c0344b"));
executeTest("testStandardConf", spec);
}
@ -274,7 +274,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" --uniquifySamples", b37KGReference),
1,
Arrays.asList("ba36b36145e038e3cb004adf11bce96c"));
Arrays.asList("9a472c4e101fff4892efb9255c5cd8b3"));
executeTest("testUniquifiedSamples", spec);
}
@ -311,4 +311,173 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
return attributeValues;
}
/**
* Section to test spanning deletions
*/
@Test
public void testSpanningDeletions() throws IOException {
final String gvcf1 = privateTestDir + "spanningDel.1.g.vcf";
final String gvcf2 = privateTestDir + "spanningDel.2.g.vcf";
final String gvcf3 = privateTestDir + "spanningDel.3.g.vcf";
// create the genotyped VCF to use as a basis for comparison against all of the combined versions
// case 0: GenotypeGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf)
final WalkerTestSpec genotypeBase = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3,
1,
Arrays.asList(""));
genotypeBase.disableShadowBCF();
final File genotypeBaseVCF = executeTest("genotypeBase", genotypeBase).getFirst().get(0);
final List<VariantContext> BASE_VARIANT_CONTEXTS = getVariantContexts(genotypeBaseVCF);
// case 1: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf)
final WalkerTestSpec combine12 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2,
1,
Arrays.asList(""));
combine12.disableShadowBCF();
final File combined_gVCF12 = executeTest("combine12", combine12).getFirst().get(0);
final WalkerTestSpec genotype12_3 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12.getAbsolutePath() + " -V " + gvcf3,
1,
Arrays.asList(""));
genotype12_3.disableShadowBCF();
final File genotype12_3VCF = executeTest("genotype12_3", genotype12_3).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_12_3 = getVariantContexts(genotype12_3VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12_3);
// case 2: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf))
final WalkerTestSpec combine12then3 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12 + " -V " + gvcf3,
1,
Arrays.asList(""));
combine12then3.disableShadowBCF();
final File combined_gVCF12then3 = executeTest("combined_gVCF12then3", combine12then3).getFirst().get(0);
final WalkerTestSpec genotype12then3 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF12then3.getAbsolutePath(),
1,
Arrays.asList(""));
genotype12then3.disableShadowBCF();
final File genotype12then3VCF = executeTest("genotype12then3", genotype12then3).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_12then3 = getVariantContexts(genotype12then3VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12then3);
// case 3: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 3.g.vcf), 2.g.vcf))
final WalkerTestSpec combine13 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf3,
1,
Arrays.asList(""));
combine13.disableShadowBCF();
final File combined_gVCF13 = executeTest("combine13", combine13).getFirst().get(0);
final WalkerTestSpec combine13then2 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF13 + " -V " + gvcf2,
1,
Arrays.asList(""));
combine13then2.disableShadowBCF();
final File combined_gVCF13then2 = executeTest("combined_gVCF13then2", combine13then2).getFirst().get(0);
final WalkerTestSpec genotype13then2 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF13then2.getAbsolutePath(),
1,
Arrays.asList(""));
genotype13then2.disableShadowBCF();
final File genotype13then2VCF = executeTest("genotype13then2", genotype13then2).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_13then2 = getVariantContexts(genotype13then2VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_13then2);
// case 4: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf))
final WalkerTestSpec combine123 = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3,
1,
Arrays.asList(""));
combine123.disableShadowBCF();
final File combined_gVCF123 = executeTest("combine123", combine123).getFirst().get(0);
final WalkerTestSpec genotype123 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + combined_gVCF123.getAbsolutePath(),
1,
Arrays.asList(""));
genotype123.disableShadowBCF();
final File genotype123VCF = executeTest("genotype123", genotype123).getFirst().get(0);
final List<VariantContext> VARIANT_CONTEXTS_123 = getVariantContexts(genotype123VCF);
testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_123);
}
/**
* Returns a list of VariantContext records from a VCF file
*
* @param vcfFile VCF file
*
* @throws IOException if the file does not exist or can not be opened
*
* @return list of VariantContext records
*/
private static List<VariantContext> getVariantContexts(final File vcfFile) throws IOException {
final VCFCodec codec = new VCFCodec();
final FileInputStream s = new FileInputStream(vcfFile);
final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s));
codec.readHeader(lineIteratorVCF);
final List<VariantContext> VCs = new ArrayList<>();
while ( lineIteratorVCF.hasNext() ) {
final String line = lineIteratorVCF.next();
Assert.assertFalse(line == null);
VCs.add(codec.decode(line));
}
return VCs;
}
private static void testVCsAreEqual(final List<VariantContext> VCs1, final List<VariantContext> VCs2) {
Assert.assertEquals(VCs1.size(), VCs2.size(), "number of Variant Contexts");
for ( int i = 0; i < VCs1.size(); i++ ) {
final VariantContext vc1 = VCs1.get(i);
final VariantContext vc2 = VCs2.get(i);
Assert.assertEquals(vc1.toStringDecodeGenotypes(), vc2.toStringDecodeGenotypes());
}
}
private static final String simpleSpanningDeletionsMD5 = "e8616a396d40b4918ad30189856ceb01";
@Test(enabled = true)
public void testSpanningDeletionsMD5() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList(simpleSpanningDeletionsMD5));
spec.disableShadowBCF();
executeTest("testSpanningDeletionsMD5", spec);
}
@Test(enabled = true)
public void testSpanningDeletionsFromCombinedGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.combined.g.vcf",
1,
Arrays.asList(simpleSpanningDeletionsMD5));
spec.disableShadowBCF();
executeTest("testSpanningDeletionsFromCombinedGVCFMD5", spec);
}
@Test(enabled = true)
public void testMultipleSpanningDeletionsMD5() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf",
1,
Arrays.asList("1c418229117bc8f148a69eda9c496309"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsMD5", spec);
}
}

View File

@ -78,6 +78,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
Allele ATref;
Allele Anoref;
Allele GT;
Allele del;
private GenomeLocParser genomeLocParser;
@ -94,6 +95,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
ATCATCT = Allele.create("ATCATCT");
ATref = Allele.create("AT",true);
Anoref = Allele.create("A",false);
del = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
GT = Allele.create("GT",false);
genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference)));
}
@ -217,6 +219,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
final List<Allele> A_C_del = Arrays.asList(Aref, C, del);
// first test the case of a single record
tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT),
@ -249,18 +252,19 @@ public class VariantContextMergerUnitTest extends BaseTest {
// a SNP with a spanning deletion
tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
loc, false, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()});
new VariantContextBuilder(VCbase).alleles(A_C_del).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 20, 72, 10}).alleles(noCalls).make()).make()});
// combination of all
tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
loc, false, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()});
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC, del)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73, 71, 72, 73, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73, 71, 73, 72, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10, 71, 73, 73, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74, 71, 72, 73, 74, 74}).alleles(noCalls).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000, 100, 1000, 1000, 1000, 1000}).alleles(noCalls).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800, 80, 800, 800, 800, 800}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73, 20, 72, 72, 72, 10}).alleles(noCalls).make()).make()});
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT),

View File

@ -123,7 +123,10 @@ public final class GATKVCFConstants {
public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
//Alleles
// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL";
public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site
}