a couple of VCF 4 improvements:

-Validation of INFO and FORMAT fields.
-Conversion to the the correct type for info fields (i.e. allele frequency is now stored as a float instead of a string).
-Checks for CNV style alternate allele encodings( i.e. <INS:ME:L1>), right now we exception out.  Maybe we should just warn the user?
-Tests for the multiple-base polymorphism allele case.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3622 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-23 20:21:43 +00:00
parent 54ae0b8e4e
commit 611d834092
2 changed files with 78 additions and 26 deletions

View File

@ -50,9 +50,9 @@ public class VCF4Codec implements FeatureCodec {
// a set of the genotype keys?
private String[] genotypeKeyArray = new String[100];
// a list of the info fields, filter fields, and format fields, for quick lookup to validate against
ArrayList<String> infoFields = new ArrayList<String>();
ArrayList<String> formatFields = new ArrayList<String>();
// a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against
TreeMap<String, VCFInfoHeaderLine.INFO_TYPE> infoFields = new TreeMap<String, VCFInfoHeaderLine.INFO_TYPE>();
TreeMap<String, VCFFormatHeaderLine.FORMAT_TYPE> formatFields = new TreeMap<String, VCFFormatHeaderLine.FORMAT_TYPE>();
ArrayList<String> filterFields = new ArrayList<String>();
// do we want to validate the info, format, and filter fields
@ -109,14 +109,12 @@ public class VCF4Codec implements FeatureCodec {
if (hl.getClass() == VCFFilterHeaderLine.class)
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
if (hl.getClass() == VCFFormatHeaderLine.class)
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
this.formatFields.put(((VCFFormatHeaderLine)hl).getmName(),((VCFFormatHeaderLine)hl).getmType());
if (hl.getClass() == VCFInfoHeaderLine.class)
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
this.infoFields.put(((VCFInfoHeaderLine)hl).getmName(),((VCFInfoHeaderLine)hl).getmType());
}
// sort the lists so we can binary search them later on
Collections.sort(filterFields);
Collections.sort(formatFields);
Collections.sort(infoFields);
return headerStrings.size();
}
@ -204,7 +202,21 @@ public class VCF4Codec implements FeatureCodec {
int eqI = field.indexOf("=");
if ( eqI != -1 ) {
key = field.substring(0, eqI);
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
String str = field.substring(eqI+1, field.length());
// lets see if the string contains a , separator
if (str.contains(",")) {
List<Object> objects = new ArrayList<Object>();
String[] split = str.split(",");
for (String substring : split) {
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
objects.add(type.convert(substring));
}
value = objects;
} else {
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
value = type.convert(str);
}
//System.out.printf("%s %s%n", key, value);
} else {
key = field;
@ -215,7 +227,7 @@ public class VCF4Codec implements FeatureCodec {
}
}
// validate the fields
validateFields(attributes.keySet(),infoFields);
validateFields(attributes.keySet(),new ArrayList(infoFields.keySet()));
attributes.put("ID", id);
return attributes;
@ -255,7 +267,8 @@ public class VCF4Codec implements FeatureCodec {
private List<Allele> parseAlleles(String ref, String alts) {
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
// ref
checkAllele(ref, true);
if (!checkAllele(ref, true))
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref);
Allele refAllele = Allele.create(ref, true);
alleles.add(refAllele);
@ -272,11 +285,17 @@ public class VCF4Codec implements FeatureCodec {
* check to make sure the allele is an acceptable allele
* @param allele the allele to check
* @param isRef are we the reference allele?
* @return true if the allele is fine, false otherwise
*/
private static void checkAllele(String allele,boolean isRef) {
if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
private static boolean checkAllele(String allele,boolean isRef) {
if (allele.contains("<")) {
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
return false;
}
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
throw new StingException("Unparsable vcf record with allele " + allele);
}
return true;
}
/**
@ -286,7 +305,8 @@ public class VCF4Codec implements FeatureCodec {
* @param isRef are we the reference allele?
*/
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
checkAllele(alt,isRef);
if (!checkAllele(alt,isRef))
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt);
Allele allele = Allele.create(alt, false);
if ( ! allele.isNoCall() )
@ -388,7 +408,7 @@ public class VCF4Codec implements FeatureCodec {
}
}
// validate the format fields
validateFields(gtAttributes.keySet(), formatFields);
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
}
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
@ -397,7 +417,6 @@ public class VCF4Codec implements FeatureCodec {
genotypes.put(g.getSampleName(), g);
}
}
// todo -- we need access to our track name to name the variant context
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
}
@ -431,7 +450,11 @@ public class VCF4Codec implements FeatureCodec {
for (Allele a : unclippedAlleles)
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)),
// the new reference length
int refLength = ref.length() - forwardClipping - reverseClipped;
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+forwardClipping+Math.max(refLength - 1,0))),
newAlleleList);
}

View File

@ -4,6 +4,8 @@ import org.broad.tribble.util.AsciiLineReader;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
@ -20,6 +22,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
@ -170,27 +173,52 @@ public class VCF4UnitTest extends BaseTest {
}
// two constants for testing
// test too many info fields
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HH\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test(expected=StingException.class)
public void testCheckTooManyInfoFields() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoManyInfoLine);
}
// test a regular line
String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test
public void testCheckInfoValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(regularLine);
}
// test too few info lines, we don't provide the DP in this line
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test
public void testCheckTwoFewInfoValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoFewInfoLine);
}
@Test(expected=StingException.class)
public void testCheckTwoManyInfoValidation() {
// test that we're getting the right genotype for a multi-base polymorphism
String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test
public void testMNPValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoManyInfoLine);
VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine);
Map<String, Genotype> genotypes = vc.getGenotypes();
Assert.assertTrue(genotypes.containsKey("NA00003"));
Genotype g = genotypes.get("NA00003");
Assert.assertTrue("Expected AT genotype, saw = " + g.getAllele(0),"AT".equals(g.getAllele(0).toString()));
Assert.assertTrue(vc.getType()== VariantContext.Type.MNP);
}
// test that we're getting the right genotype for what appears to be a multi-base polymorphism, but is really just a SNP
String MNPLine2 = "20\t14370\trs6054257\tGT\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test
public void testMNP2Validation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine2);
Map<String, Genotype> genotypes = vc.getGenotypes();
Assert.assertTrue(genotypes.containsKey("NA00003"));
Genotype g = genotypes.get("NA00003");
Assert.assertTrue("Expected A genotype, saw = " + g.getAllele(0),"A".equals(g.getAllele(0).toString()));
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
}
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
@ -218,8 +246,9 @@ public class VCF4UnitTest extends BaseTest {
try {
testSetup.codec.decode(line);
} catch (Exception e) {
System.err.println(e.getMessage() + " -> " + line);
System.err.println(line);
//System.err.println(e.getMessage() + " -> " + line);
//System.err.println(line);
Assert.fail("Bad record from line " + line + " message = " + e.getMessage());
badRecordCount++;
}
line = reader.readLine();