diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index 53c885cec..b90163846 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -50,9 +50,9 @@ public class VCF4Codec implements FeatureCodec { // a set of the genotype keys? private String[] genotypeKeyArray = new String[100]; - // a list of the info fields, filter fields, and format fields, for quick lookup to validate against - ArrayList infoFields = new ArrayList(); - ArrayList formatFields = new ArrayList(); + // a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against + TreeMap infoFields = new TreeMap(); + TreeMap formatFields = new TreeMap(); ArrayList filterFields = new ArrayList(); // do we want to validate the info, format, and filter fields @@ -109,14 +109,12 @@ public class VCF4Codec implements FeatureCodec { if (hl.getClass() == VCFFilterHeaderLine.class) this.filterFields.add(((VCFFilterHeaderLine)hl).getmName()); if (hl.getClass() == VCFFormatHeaderLine.class) - this.formatFields.add(((VCFFormatHeaderLine)hl).getmName()); + this.formatFields.put(((VCFFormatHeaderLine)hl).getmName(),((VCFFormatHeaderLine)hl).getmType()); if (hl.getClass() == VCFInfoHeaderLine.class) - this.infoFields.add(((VCFInfoHeaderLine)hl).getmName()); + this.infoFields.put(((VCFInfoHeaderLine)hl).getmName(),((VCFInfoHeaderLine)hl).getmType()); } // sort the lists so we can binary search them later on Collections.sort(filterFields); - Collections.sort(formatFields); - Collections.sort(infoFields); return headerStrings.size(); } @@ -204,7 +202,21 @@ public class VCF4Codec implements FeatureCodec { int eqI = field.indexOf("="); if ( eqI != -1 ) { key = field.substring(0, eqI); - value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc + String str = field.substring(eqI+1, field.length()); + + // lets see if the string contains a , separator + if (str.contains(",")) { + List objects = new ArrayList(); + String[] split = str.split(","); + for (String substring : split) { + VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key); + objects.add(type.convert(substring)); + } + value = objects; + } else { + VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key); + value = type.convert(str); + } //System.out.printf("%s %s%n", key, value); } else { key = field; @@ -215,7 +227,7 @@ public class VCF4Codec implements FeatureCodec { } } // validate the fields - validateFields(attributes.keySet(),infoFields); + validateFields(attributes.keySet(),new ArrayList(infoFields.keySet())); attributes.put("ID", id); return attributes; @@ -255,7 +267,8 @@ public class VCF4Codec implements FeatureCodec { private List parseAlleles(String ref, String alts) { List alleles = new ArrayList(2); // we are almost always biallelic // ref - checkAllele(ref, true); + if (!checkAllele(ref, true)) + throw new StingException("Unable to parse out correct reference allele, we saw = " + ref); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); @@ -272,11 +285,17 @@ public class VCF4Codec implements FeatureCodec { * check to make sure the allele is an acceptable allele * @param allele the allele to check * @param isRef are we the reference allele? + * @return true if the allele is fine, false otherwise */ - private static void checkAllele(String allele,boolean isRef) { - if ( ! Allele.acceptableAlleleBases(allele,isRef) ) { + private static boolean checkAllele(String allele,boolean isRef) { + if (allele.contains("<")) { + Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele); + return false; + } + else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) { throw new StingException("Unparsable vcf record with allele " + allele); } + return true; } /** @@ -286,7 +305,8 @@ public class VCF4Codec implements FeatureCodec { * @param isRef are we the reference allele? */ private void parseSingleAllele(List alleles, String alt, boolean isRef) { - checkAllele(alt,isRef); + if (!checkAllele(alt,isRef)) + throw new StingException("Unable to parse out correct alt allele, we saw = " + alt); Allele allele = Allele.create(alt, false); if ( ! allele.isNoCall() ) @@ -388,7 +408,7 @@ public class VCF4Codec implements FeatureCodec { } } // validate the format fields - validateFields(gtAttributes.keySet(), formatFields); + validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet())); } boolean phased = genotypeKeyArray[0].charAt(1) == '|'; @@ -397,7 +417,6 @@ public class VCF4Codec implements FeatureCodec { genotypes.put(g.getSampleName(), g); } } - // todo -- we need access to our track name to name the variant context return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); } @@ -431,7 +450,11 @@ public class VCF4Codec implements FeatureCodec { for (Allele a : unclippedAlleles) newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference())); - return new Pair>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)), + + // the new reference length + int refLength = ref.length() - forwardClipping - reverseClipped; + + return new Pair>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+forwardClipping+Math.max(refLength - 1,0))), newAlleleList); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 4da25db74..86b0567d1 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -4,6 +4,8 @@ import org.broad.tribble.util.AsciiLineReader; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; @@ -20,6 +22,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; /** @@ -170,27 +173,52 @@ public class VCF4UnitTest extends BaseTest { } - // two constants for testing + // test too many info fields + String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HH\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; + @Test(expected=StingException.class) + public void testCheckTooManyInfoFields() { + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); + testSetup.codec.decode(twoManyInfoLine); + } + // test a regular line String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; - String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; - String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; - @Test public void testCheckInfoValidation() { TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); testSetup.codec.decode(regularLine); } - + // test too few info lines, we don't provide the DP in this line + String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; @Test public void testCheckTwoFewInfoValidation() { TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); testSetup.codec.decode(twoFewInfoLine); } - @Test(expected=StingException.class) - public void testCheckTwoManyInfoValidation() { + // test that we're getting the right genotype for a multi-base polymorphism + String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; + @Test + public void testMNPValidation() { TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); - testSetup.codec.decode(twoManyInfoLine); + VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine); + Map genotypes = vc.getGenotypes(); + Assert.assertTrue(genotypes.containsKey("NA00003")); + Genotype g = genotypes.get("NA00003"); + Assert.assertTrue("Expected AT genotype, saw = " + g.getAllele(0),"AT".equals(g.getAllele(0).toString())); + Assert.assertTrue(vc.getType()== VariantContext.Type.MNP); + } + + // test that we're getting the right genotype for what appears to be a multi-base polymorphism, but is really just a SNP + String MNPLine2 = "20\t14370\trs6054257\tGT\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,."; + @Test + public void testMNP2Validation() { + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); + VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine2); + Map genotypes = vc.getGenotypes(); + Assert.assertTrue(genotypes.containsKey("NA00003")); + Genotype g = genotypes.get("NA00003"); + Assert.assertTrue("Expected A genotype, saw = " + g.getAllele(0),"A".equals(g.getAllele(0).toString())); + Assert.assertTrue(vc.getType()== VariantContext.Type.SNP); } File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf"); @@ -218,8 +246,9 @@ public class VCF4UnitTest extends BaseTest { try { testSetup.codec.decode(line); } catch (Exception e) { - System.err.println(e.getMessage() + " -> " + line); - System.err.println(line); + //System.err.println(e.getMessage() + " -> " + line); + //System.err.println(line); + Assert.fail("Bad record from line " + line + " message = " + e.getMessage()); badRecordCount++; } line = reader.readLine();