diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index c13f0700c..82921e6f4 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -175,7 +175,8 @@ public class VCF4Codec implements FeatureCodec { */ private static List parseGenotypeAlleles(String GT, List alleles, Map> cache) { // this should cache results [since they are immutable] and return a single object for each genotype - if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0 + if ( GT.length() != 3 ) + throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0 List GTAlleles = cache.get(GT); if ( GTAlleles == null ) { GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles)); @@ -377,8 +378,7 @@ public class VCF4Codec implements FeatureCodec { // do we have genotyping data if (parts.length > 8) { - int genotypesStart = 9; - genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart); + genotypes = createGenotypeMap(parts, locAndAlleles, 8); } return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes); } @@ -387,14 +387,14 @@ public class VCF4Codec implements FeatureCodec { * create a genotype map * @param parts the string parts * @param locAndAlleles the locations and the list of alleles - * @param genotypesStart the position in the parts array that the genotype strings start + * @param formatFieldLocation the position in the parts array that the genotype strings start * @return a mapping of sample name to genotype object */ - private Map createGenotypeMap(String[] parts, Pair> locAndAlleles, int genotypesStart) { - Map genotypes = new LinkedHashMap(Math.max(parts.length - genotypesStart, 1)); + protected Map createGenotypeMap(String[] parts, Pair> locAndAlleles, int formatFieldLocation) { + Map genotypes = new LinkedHashMap(Math.max(parts.length - formatFieldLocation, 1)); // get the format keys - int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':'); + int nGTKeys = ParsingUtils.split(parts[formatFieldLocation], genotypeKeyArray, ':'); // cycle through the sample names Iterator sampleNameIterator = header.getGenotypeSamples().iterator(); @@ -403,9 +403,9 @@ public class VCF4Codec implements FeatureCodec { alleleMap.clear(); // cycle through the genotype strings - for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) { + for (int genotypeOffset = formatFieldLocation + 1; genotypeOffset < parts.length; genotypeOffset++) { int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':'); - List genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap); + double GTQual = VariantContext.NO_NEG_LOG_10PERROR; Set genotypeFilters = null; String sampleName = sampleNameIterator.next(); @@ -418,11 +418,17 @@ public class VCF4Codec implements FeatureCodec { if (nGTKeys < GTValueSplitSize) throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]); + int genotypeAlleleLocation = -1; if (nGTKeys > 1) { gtAttributes = new HashMap(nGTKeys - 1); - for (int i = 1; i < nGTKeys; i++) { + for (int i = 0; i < nGTKeys; i++) { if (i >= GTValueSplitSize) gtAttributes.put(genotypeKeyArray[i],"."); + else if (genotypeKeyArray[i].equals("GT")) + if (genotypeAlleleLocation >= 0) + throw new StingException("Saw two GT fields in record at position " + locAndAlleles.first); + else + genotypeAlleleLocation = i; else if (genotypeKeyArray[i].equals("GQ")) GTQual = parseQual(GTValueArray[i]); else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here @@ -434,11 +440,20 @@ public class VCF4Codec implements FeatureCodec { // validate the format fields validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet())); } + // check to make sure we found a gentoype field + if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first); - boolean phased = genotypeKeyArray[0].charAt(1) == '|'; + // assuming allele list length in the single digits, could be bad + boolean phased = GTValueArray[genotypeAlleleLocation].charAt(1) == '|'; + + // add it to the list + genotypes.put(sampleName, new Genotype(sampleName, + parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], locAndAlleles.second, alleleMap), + GTQual, + genotypeFilters, + gtAttributes, + phased)); - Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased); - genotypes.put(g.getSampleName(), g); } return genotypes; } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 0a34d5628..763087440 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -221,9 +221,9 @@ public class VCF4UnitTest extends BaseTest { Assert.assertTrue(vc.getType()== VariantContext.Type.SNP); } - File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf"); + File largeVCF = new File("yri.vcf"); // change to whatever file you'd like to test in the following test - //@Test + // @Test uncomment to re-enable testing public void checkLargeVCF() { TestSetup testSetup = new TestSetup().invoke(largeVCF); AsciiLineReader reader = testSetup.getReader(); @@ -435,6 +435,20 @@ public class VCF4UnitTest extends BaseTest { Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG")); } + @Test + public void testGenotypeConversionPhasing() { + String[] parts = {"GT:GD:DP", "0|0", "0|1", "1\\1"}; + List alleles = new ArrayList(); + alleles.add(Allele.create("A", true)); + alleles.add(Allele.create("G", false)); + Pair> locAndAlleles = new Pair>(GenomeLocParser.createGenomeLoc("1",1),alleles); + TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile); + Map genotypes = testSetup.getCodec().createGenotypeMap(parts, locAndAlleles,0); + // assert the first genotype is phased, and the third is not + Assert.assertTrue(genotypes.get("NA00001").genotypesArePhased()); + Assert.assertTrue(!genotypes.get("NA00003").genotypesArePhased()); + } + /** * a test setup for the VCF 4 codec */