diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index da467c1dd..97e874542 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -25,6 +25,9 @@ import java.util.*; */ public class VCF4Codec implements FeatureCodec { + // a variant context flag for original allele strings + public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST"; + // we have to store the list of strings that make up the header until they're needed private VCFHeader header = null; @@ -364,6 +367,7 @@ public class VCF4Codec implements FeatureCodec { */ private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) { try { + // increment the line count lineNo++; // parse out the required fields @@ -376,15 +380,19 @@ public class VCF4Codec implements FeatureCodec { String filter = parts[6]; String info = parts[7]; - + // get our alleles, filters, and setup an attribute map List alleles = parseAlleles(ref, alts); Set filters = parseFilters(filter); Map attributes = parseInfo(info, id); // find out our current location, and clip the alleles down to their minimum length - Pair> locAndAlleles = (ref.length() > 1) ? - clipAlleles(contig, pos, ref, alleles) : - new Pair>(GenomeLocParser.createGenomeLoc(contig, pos), alleles); + Pair> locAndAlleles; + if (ref.length() > 1) { + attributes.put(ORIGINAL_ALLELE_LIST,alleles); + locAndAlleles = clipAlleles(contig, pos, ref, alleles); + } else { + locAndAlleles = new Pair>(GenomeLocParser.createGenomeLoc(contig, pos), alleles); + } // a map to store our genotypes Map genotypes = null; @@ -477,7 +485,11 @@ public class VCF4Codec implements FeatureCodec { /** * clip the alleles, based on the reference - * @param unclippedAlleles the list of alleles + * + * @param contig our contig position + * @param position the unadjusted start position (pre-clipping) + * @param ref the reference string + * @param unclippedAlleles the list of unclipped alleles * @return a list of alleles, clipped to the reference */ static Pair> clipAlleles(String contig, long position, String ref, List unclippedAlleles) { @@ -502,7 +514,6 @@ public class VCF4Codec implements FeatureCodec { if (clipping) reverseClipped++; } - for (Allele a : unclippedAlleles) newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference())); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 527f3e438..01311b0da 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -41,7 +41,7 @@ public class VCF4UnitTest extends BaseTest { } catch (FileNotFoundException e) { throw new StingException("unable to load the sequence dictionary"); } - GenomeLocParser.setupRefContigOrdering(seq); + GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary()); } @Test @@ -197,13 +197,33 @@ public class VCF4UnitTest extends BaseTest { } // test too few info lines, we don't provide the DP in this line - String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51"; + String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51"; @Test public void testCheckTwoFewInfoValidation() { TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); testSetup.codec.decode(twoFewInfoLine); } + // test that the variant context adds attributes for the original alleles when clipped + String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51"; + @Test + public void testClippedAllelesAddedAsAnnotation() { + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); + VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine); + Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST)); + List alleles = (List)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST); + Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size()); + Assert.assertTrue(alleles.get(0).basesMatch("GGG")); + Assert.assertTrue(alleles.get(1).basesMatch("G")); + } + // test that when we don't clip the alleles, we don't see the annotation + @Test + public void testNoClippedAllelesNoAddedAsAnnotation() { + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); + VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine); + Assert.assertTrue(!context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST)); + } + // test that we're getting the right genotype for a multi-base polymorphism String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; @Test