Two fixes for the VCF 4 parser:
- Allow the "GT" field to appear at any position in the genotype string (previously it was required to be the first key-value pair).
- Fix a bug with the phasing value put into the VariantContext; thanks for the catch, Guillermo!
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3638 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e15fe6858e
commit
b3edb7dc08
|
|
@ -175,7 +175,8 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
*/
|
*/
|
||||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||||
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
if ( GT.length() != 3 )
|
||||||
|
throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||||
List<Allele> GTAlleles = cache.get(GT);
|
List<Allele> GTAlleles = cache.get(GT);
|
||||||
if ( GTAlleles == null ) {
|
if ( GTAlleles == null ) {
|
||||||
GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
|
GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
|
||||||
|
|
@ -377,8 +378,7 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
|
|
||||||
// do we have genotyping data
|
// do we have genotyping data
|
||||||
if (parts.length > 8) {
|
if (parts.length > 8) {
|
||||||
int genotypesStart = 9;
|
genotypes = createGenotypeMap(parts, locAndAlleles, 8);
|
||||||
genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart);
|
|
||||||
}
|
}
|
||||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||||
}
|
}
|
||||||
|
|
@ -387,14 +387,14 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
* create a genotype map
|
* create a genotype map
|
||||||
* @param parts the string parts
|
* @param parts the string parts
|
||||||
* @param locAndAlleles the locations and the list of alleles
|
* @param locAndAlleles the locations and the list of alleles
|
||||||
* @param genotypesStart the position in the parts array that the genotype strings start
|
* @param formatFieldLocation the position in the parts array that the genotype strings start
|
||||||
* @return a mapping of sample name to genotype object
|
* @return a mapping of sample name to genotype object
|
||||||
*/
|
*/
|
||||||
private Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int genotypesStart) {
|
protected Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int formatFieldLocation) {
|
||||||
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
|
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - formatFieldLocation, 1));
|
||||||
|
|
||||||
// get the format keys
|
// get the format keys
|
||||||
int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':');
|
int nGTKeys = ParsingUtils.split(parts[formatFieldLocation], genotypeKeyArray, ':');
|
||||||
|
|
||||||
// cycle through the sample names
|
// cycle through the sample names
|
||||||
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
|
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
|
||||||
|
|
@ -403,9 +403,9 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
alleleMap.clear();
|
alleleMap.clear();
|
||||||
|
|
||||||
// cycle through the genotype strings
|
// cycle through the genotype strings
|
||||||
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
|
for (int genotypeOffset = formatFieldLocation + 1; genotypeOffset < parts.length; genotypeOffset++) {
|
||||||
int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':');
|
int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':');
|
||||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap);
|
|
||||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||||
Set<String> genotypeFilters = null;
|
Set<String> genotypeFilters = null;
|
||||||
String sampleName = sampleNameIterator.next();
|
String sampleName = sampleNameIterator.next();
|
||||||
|
|
@ -418,11 +418,17 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
if (nGTKeys < GTValueSplitSize)
|
if (nGTKeys < GTValueSplitSize)
|
||||||
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
||||||
|
|
||||||
|
int genotypeAlleleLocation = -1;
|
||||||
if (nGTKeys > 1) {
|
if (nGTKeys > 1) {
|
||||||
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
||||||
for (int i = 1; i < nGTKeys; i++) {
|
for (int i = 0; i < nGTKeys; i++) {
|
||||||
if (i >= GTValueSplitSize)
|
if (i >= GTValueSplitSize)
|
||||||
gtAttributes.put(genotypeKeyArray[i],".");
|
gtAttributes.put(genotypeKeyArray[i],".");
|
||||||
|
else if (genotypeKeyArray[i].equals("GT"))
|
||||||
|
if (genotypeAlleleLocation >= 0)
|
||||||
|
throw new StingException("Saw two GT fields in record at position " + locAndAlleles.first);
|
||||||
|
else
|
||||||
|
genotypeAlleleLocation = i;
|
||||||
else if (genotypeKeyArray[i].equals("GQ"))
|
else if (genotypeKeyArray[i].equals("GQ"))
|
||||||
GTQual = parseQual(GTValueArray[i]);
|
GTQual = parseQual(GTValueArray[i]);
|
||||||
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
|
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
|
||||||
|
|
@ -434,11 +440,20 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
// validate the format fields
|
// validate the format fields
|
||||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||||
}
|
}
|
||||||
|
// check to make sure we found a genotype field
|
||||||
|
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first);
|
||||||
|
|
||||||
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
// assuming allele list length in the single digits, could be bad
|
||||||
|
boolean phased = GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
||||||
|
|
||||||
|
// add it to the list
|
||||||
|
genotypes.put(sampleName, new Genotype(sampleName,
|
||||||
|
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], locAndAlleles.second, alleleMap),
|
||||||
|
GTQual,
|
||||||
|
genotypeFilters,
|
||||||
|
gtAttributes,
|
||||||
|
phased));
|
||||||
|
|
||||||
Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
|
||||||
genotypes.put(g.getSampleName(), g);
|
|
||||||
}
|
}
|
||||||
return genotypes;
|
return genotypes;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -221,9 +221,9 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
|
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
|
||||||
}
|
}
|
||||||
|
|
||||||
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
File largeVCF = new File("yri.vcf"); // change to whatever file you'd like to test in the following test
|
||||||
|
|
||||||
//@Test
|
// @Test uncomment to re-enable testing
|
||||||
public void checkLargeVCF() {
|
public void checkLargeVCF() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||||
AsciiLineReader reader = testSetup.getReader();
|
AsciiLineReader reader = testSetup.getReader();
|
||||||
|
|
@ -435,6 +435,20 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
|
Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGenotypeConversionPhasing() {
|
||||||
|
String[] parts = {"GT:GD:DP", "0|0", "0|1", "1\\1"};
|
||||||
|
List<Allele> alleles = new ArrayList<Allele>();
|
||||||
|
alleles.add(Allele.create("A", true));
|
||||||
|
alleles.add(Allele.create("G", false));
|
||||||
|
Pair<GenomeLoc, List<Allele>> locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc("1",1),alleles);
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
|
Map<String, Genotype> genotypes = testSetup.getCodec().createGenotypeMap(parts, locAndAlleles,0);
|
||||||
|
// assert the first genotype is phased, and the third is not
|
||||||
|
Assert.assertTrue(genotypes.get("NA00001").genotypesArePhased());
|
||||||
|
Assert.assertTrue(!genotypes.get("NA00003").genotypesArePhased());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* a test setup for the VCF 4 codec
|
* a test setup for the VCF 4 codec
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue