two fixes for the VCF 4 parser:
- Allow the "GT" field in genotypes at any point in the genotype string (before we required they be the first key-value pair). - Fix a bug with the phasing value put into the VariantContext, thanks for the catch Guillermo! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3638 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e15fe6858e
commit
b3edb7dc08
|
|
@ -175,7 +175,8 @@ public class VCF4Codec implements FeatureCodec {
|
|||
*/
|
||||
private static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
// this should cache results [since they are immutable] and return a single object for each genotype
|
||||
if ( GT.length() != 3 ) throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||
if ( GT.length() != 3 )
|
||||
throw new StingException("Unreasonable number of alleles"); // 0/1 => barf on 10/0
|
||||
List<Allele> GTAlleles = cache.get(GT);
|
||||
if ( GTAlleles == null ) {
|
||||
GTAlleles = Arrays.asList(oneAllele(GT.charAt(0), alleles), oneAllele(GT.charAt(2), alleles));
|
||||
|
|
@ -377,8 +378,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
|
||||
// do we have genotyping data
|
||||
if (parts.length > 8) {
|
||||
int genotypesStart = 9;
|
||||
genotypes = createGenotypeMap(parts, locAndAlleles, genotypesStart);
|
||||
genotypes = createGenotypeMap(parts, locAndAlleles, 8);
|
||||
}
|
||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
}
|
||||
|
|
@ -387,14 +387,14 @@ public class VCF4Codec implements FeatureCodec {
|
|||
* create a genotype map
|
||||
* @param parts the string parts
|
||||
* @param locAndAlleles the locations and the list of alleles
|
||||
* @param genotypesStart the position in the parts array that the genotype strings start
|
||||
* @param formatFieldLocation the position in the parts array that the genotype strings start
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
private Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int genotypesStart) {
|
||||
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
|
||||
protected Map<String, Genotype> createGenotypeMap(String[] parts, Pair<GenomeLoc, List<Allele>> locAndAlleles, int formatFieldLocation) {
|
||||
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(Math.max(parts.length - formatFieldLocation, 1));
|
||||
|
||||
// get the format keys
|
||||
int nGTKeys = ParsingUtils.split(parts[8], genotypeKeyArray, ':');
|
||||
int nGTKeys = ParsingUtils.split(parts[formatFieldLocation], genotypeKeyArray, ':');
|
||||
|
||||
// cycle through the sample names
|
||||
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
|
||||
|
|
@ -403,9 +403,9 @@ public class VCF4Codec implements FeatureCodec {
|
|||
alleleMap.clear();
|
||||
|
||||
// cycle through the genotype strings
|
||||
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
|
||||
for (int genotypeOffset = formatFieldLocation + 1; genotypeOffset < parts.length; genotypeOffset++) {
|
||||
int GTValueSplitSize = ParsingUtils.split(parts[genotypeOffset], GTValueArray, ':');
|
||||
List<Allele> genotypeAlleles = parseGenotypeAlleles(GTValueArray[0], locAndAlleles.second, alleleMap);
|
||||
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
Set<String> genotypeFilters = null;
|
||||
String sampleName = sampleNameIterator.next();
|
||||
|
|
@ -418,11 +418,17 @@ public class VCF4Codec implements FeatureCodec {
|
|||
if (nGTKeys < GTValueSplitSize)
|
||||
throw new StingException("Too few keys for compared to the value string " + sampleName + ", keys = " + parts[8] + " values = " + parts[genotypeOffset]);
|
||||
|
||||
int genotypeAlleleLocation = -1;
|
||||
if (nGTKeys > 1) {
|
||||
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
||||
for (int i = 1; i < nGTKeys; i++) {
|
||||
for (int i = 0; i < nGTKeys; i++) {
|
||||
if (i >= GTValueSplitSize)
|
||||
gtAttributes.put(genotypeKeyArray[i],".");
|
||||
else if (genotypeKeyArray[i].equals("GT"))
|
||||
if (genotypeAlleleLocation >= 0)
|
||||
throw new StingException("Saw two GT fields in record at position " + locAndAlleles.first);
|
||||
else
|
||||
genotypeAlleleLocation = i;
|
||||
else if (genotypeKeyArray[i].equals("GQ"))
|
||||
GTQual = parseQual(GTValueArray[i]);
|
||||
else if (genotypeKeyArray[i].equals("FL")) // deal with genotype filters here
|
||||
|
|
@ -434,11 +440,20 @@ public class VCF4Codec implements FeatureCodec {
|
|||
// validate the format fields
|
||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||
}
|
||||
// check to make sure we found a gentoype field
|
||||
if (genotypeAlleleLocation < 0) throw new StingException("Unable to find required field GT for record " + locAndAlleles.first);
|
||||
|
||||
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
||||
// assuming allele list length in the single digits, could be bad
|
||||
boolean phased = GTValueArray[genotypeAlleleLocation].charAt(1) == '|';
|
||||
|
||||
// add it to the list
|
||||
genotypes.put(sampleName, new Genotype(sampleName,
|
||||
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], locAndAlleles.second, alleleMap),
|
||||
GTQual,
|
||||
genotypeFilters,
|
||||
gtAttributes,
|
||||
phased));
|
||||
|
||||
Genotype g = new Genotype(sampleName, genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
|
||||
genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
return genotypes;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -221,9 +221,9 @@ public class VCF4UnitTest extends BaseTest {
|
|||
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
|
||||
}
|
||||
|
||||
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||
File largeVCF = new File("yri.vcf"); // change to whatever file you'd like to test in the following test
|
||||
|
||||
//@Test
|
||||
// @Test uncomment to re-enable testing
|
||||
public void checkLargeVCF() {
|
||||
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||
AsciiLineReader reader = testSetup.getReader();
|
||||
|
|
@ -435,6 +435,20 @@ public class VCF4UnitTest extends BaseTest {
|
|||
Assert.assertTrue(locAndList.second.get(2).toString().equals("GGGGGG"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGenotypeConversionPhasing() {
|
||||
String[] parts = {"GT:GD:DP", "0|0", "0|1", "1\\1"};
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create("A", true));
|
||||
alleles.add(Allele.create("G", false));
|
||||
Pair<GenomeLoc, List<Allele>> locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc("1",1),alleles);
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
Map<String, Genotype> genotypes = testSetup.getCodec().createGenotypeMap(parts, locAndAlleles,0);
|
||||
// assert the first genotype is phased, and the third is not
|
||||
Assert.assertTrue(genotypes.get("NA00001").genotypesArePhased());
|
||||
Assert.assertTrue(!genotypes.get("NA00003").genotypesArePhased());
|
||||
}
|
||||
|
||||
/**
|
||||
* a test setup for the VCF 4 codec
|
||||
*/
|
||||
|
|
|
|||
Loading…
Reference in New Issue