two fixes for the VCF 4 parser:

- Allow the "GT" field in genotypes at any point in the genotype string (before we required they be the first key-value pair).
- Fix a bug with the phasing value put into the VariantContext, thanks for the catch Guillermo!

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3638 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-25 18:01:23 +00:00
parent e15fe6858e
commit b3edb7dc08
2 changed files with 44 additions and 15 deletions

View File

@ -175,7 +175,8 @@ public class VCF4Codec implements FeatureCodec {
*/
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
// this should cache results [since they are immutable] and return a single object for each genotype
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
if ( GT.length() != 3 )
throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
List<Allele> GTAlleles = cache.get(GT);
if ( GTAlleles == null ) {
GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
@ -377,8 +378,7 @@ public class VCF4Codec implements FeatureCodec {
// do we have genotyping data
if (parts.length > 8) {
int genotypesStart = 9;
genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart);
genotypes = createGenotypeMap(parts, locAndAlleles, 8);
}
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
}
@ -387,14 +387,14 @@ public class VCF4Codec implements FeatureCodec {
* create a genotype map
* @param parts the string parts
* @param locAndAlleles the locations and the list of alleles
* @param genotypesStart the position in the parts array that the genotype strings start
* @param formatFieldLocation the position in the parts array that the genotype strings start
* @return a mapping of sample name to genotype object
*/
private Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int genotypesStart) {
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
protected Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int formatFieldLocation) {
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - formatFieldLocation, 1));
// get the format keys
int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':');
int nGTKeys = ParsingUtils.split(parts[formatFieldLocation], genotypeKeyArray, ':');
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
@ -403,9 +403,9 @@ public class VCF4Codec implements FeatureCodec {
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
for (int genotypeOffset = formatFieldLocation + 1; genotypeOffset < parts.length; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':');
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap);
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
Set<String> genotypeFilters = null;
String sampleName = sampleNameIterator.next();
@ -418,11 +418,17 @@ public class VCF4Codec implements FeatureCodec {
if (nGTKeys < GTValueSplitSize)
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys > 1) {
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
for (int i = 1; i < nGTKeys; i++) {
for (int i = 0; i < nGTKeys; i++) {
if (i >= GTValueSplitSize)
gtAttributes.put(genotypeKeyArray[i],".");
else if (genotypeKeyArray[i].equals("GT"))
if (genotypeAlleleLocation >= 0)
throw new StingException("Saw two GT fields in record at position " + locAndAlleles.first);
else
genotypeAlleleLocation = i;
else if (genotypeKeyArray[i].equals("GQ"))
GTQual = parseQual(GTValueArray[i]);
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
@ -434,11 +440,20 @@ public class VCF4Codec implements FeatureCodec {
// validate the format fields
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
}
// check to make sure we found a gentoype field
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first);
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
// assuming allele list length in the single digits, could be bad
boolean phased = GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
// add it to the list
genotypes.put(sampleName, new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], locAndAlleles.second, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
genotypes.put(g.getSampleName(), g);
}
return genotypes;
}

View File

@ -221,9 +221,9 @@ public class VCF4UnitTest extends BaseTest {
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
}
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
File largeVCF = new File("yri.vcf"); // change to whatever file you'd like to test in the following test
//@Test
// @Test uncomment to re-enable testing
public void checkLargeVCF() {
TestSetup testSetup = new TestSetup().invoke(largeVCF);
AsciiLineReader reader = testSetup.getReader();
@ -435,6 +435,20 @@ public class VCF4UnitTest extends BaseTest {
Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
}
@Test
public void testGenotypeConversionPhasing() {
String[] parts = {"GT:GD:DP", "0|0", "0|1", "1\\1"};
List<Allele> alleles = new ArrayList<Allele>();
alleles.add(Allele.create("A", true));
alleles.add(Allele.create("G", false));
Pair<GenomeLoc, List<Allele>> locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc("1",1),alleles);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
Map<String, Genotype> genotypes = testSetup.getCodec().createGenotypeMap(parts, locAndAlleles,0);
// assert the first genotype is phased, and the third is not
Assert.assertTrue(genotypes.get("NA00001").genotypesArePhased());
Assert.assertTrue(!genotypes.get("NA00003").genotypesArePhased());
}
/**
* a test setup for the VCF 4 codec
*/