a couple of VCF 4 improvements:
-Validation of INFO and FORMAT fields. -Conversion to the correct type for INFO fields (i.e. allele frequency is now stored as a float instead of a string). -Checks for CNV-style alternate allele encodings (i.e. <INS:ME:L1>); right now we throw an exception. Maybe we should just warn the user? -Tests for the multiple-base polymorphism allele case. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3622 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
54ae0b8e4e
commit
611d834092
|
|
@ -50,9 +50,9 @@ public class VCF4Codec implements FeatureCodec {
|
|||
// a set of the genotype keys?
|
||||
private String[] genotypeKeyArray = new String[100];
|
||||
|
||||
// a list of the info fields, filter fields, and format fields, for quick lookup to validate against
|
||||
ArrayList<String> infoFields = new ArrayList<String>();
|
||||
ArrayList<String> formatFields = new ArrayList<String>();
|
||||
// a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against
|
||||
TreeMap<String, VCFInfoHeaderLine.INFO_TYPE> infoFields = new TreeMap<String, VCFInfoHeaderLine.INFO_TYPE>();
|
||||
TreeMap<String, VCFFormatHeaderLine.FORMAT_TYPE> formatFields = new TreeMap<String, VCFFormatHeaderLine.FORMAT_TYPE>();
|
||||
ArrayList<String> filterFields = new ArrayList<String>();
|
||||
|
||||
// do we want to validate the info, format, and filter fields
|
||||
|
|
@ -109,14 +109,12 @@ public class VCF4Codec implements FeatureCodec {
|
|||
if (hl.getClass() == VCFFilterHeaderLine.class)
|
||||
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
|
||||
if (hl.getClass() == VCFFormatHeaderLine.class)
|
||||
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
|
||||
this.formatFields.put(((VCFFormatHeaderLine)hl).getmName(),((VCFFormatHeaderLine)hl).getmType());
|
||||
if (hl.getClass() == VCFInfoHeaderLine.class)
|
||||
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
|
||||
this.infoFields.put(((VCFInfoHeaderLine)hl).getmName(),((VCFInfoHeaderLine)hl).getmType());
|
||||
}
|
||||
// sort the lists so we can binary search them later on
|
||||
Collections.sort(filterFields);
|
||||
Collections.sort(formatFields);
|
||||
Collections.sort(infoFields);
|
||||
|
||||
return headerStrings.size();
|
||||
}
|
||||
|
|
@ -204,7 +202,21 @@ public class VCF4Codec implements FeatureCodec {
|
|||
int eqI = field.indexOf("=");
|
||||
if ( eqI != -1 ) {
|
||||
key = field.substring(0, eqI);
|
||||
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
|
||||
String str = field.substring(eqI+1, field.length());
|
||||
|
||||
// lets see if the string contains a , separator
|
||||
if (str.contains(",")) {
|
||||
List<Object> objects = new ArrayList<Object>();
|
||||
String[] split = str.split(",");
|
||||
for (String substring : split) {
|
||||
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||
objects.add(type.convert(substring));
|
||||
}
|
||||
value = objects;
|
||||
} else {
|
||||
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||
value = type.convert(str);
|
||||
}
|
||||
//System.out.printf("%s %s%n", key, value);
|
||||
} else {
|
||||
key = field;
|
||||
|
|
@ -215,7 +227,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
}
|
||||
// validate the fields
|
||||
validateFields(attributes.keySet(),infoFields);
|
||||
validateFields(attributes.keySet(),new ArrayList(infoFields.keySet()));
|
||||
|
||||
attributes.put("ID", id);
|
||||
return attributes;
|
||||
|
|
@ -255,7 +267,8 @@ public class VCF4Codec implements FeatureCodec {
|
|||
private List<Allele> parseAlleles(String ref, String alts) {
|
||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||
// ref
|
||||
checkAllele(ref, true);
|
||||
if (!checkAllele(ref, true))
|
||||
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref);
|
||||
Allele refAllele = Allele.create(ref, true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
|
|
@ -272,11 +285,17 @@ public class VCF4Codec implements FeatureCodec {
|
|||
* check to make sure the allele is an acceptable allele
|
||||
* @param allele the allele to check
|
||||
* @param isRef are we the reference allele?
|
||||
* @return true if the allele is fine, false otherwise
|
||||
*/
|
||||
private static void checkAllele(String allele,boolean isRef) {
|
||||
if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
||||
private static boolean checkAllele(String allele,boolean isRef) {
|
||||
if (allele.contains("<")) {
|
||||
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
|
||||
return false;
|
||||
}
|
||||
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
||||
throw new StingException("Unparsable vcf record with allele " + allele);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -286,7 +305,8 @@ public class VCF4Codec implements FeatureCodec {
|
|||
* @param isRef are we the reference allele?
|
||||
*/
|
||||
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
|
||||
checkAllele(alt,isRef);
|
||||
if (!checkAllele(alt,isRef))
|
||||
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt);
|
||||
|
||||
Allele allele = Allele.create(alt, false);
|
||||
if ( ! allele.isNoCall() )
|
||||
|
|
@ -388,7 +408,7 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
}
|
||||
// validate the format fields
|
||||
validateFields(gtAttributes.keySet(), formatFields);
|
||||
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||
}
|
||||
|
||||
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
||||
|
|
@ -397,7 +417,6 @@ public class VCF4Codec implements FeatureCodec {
|
|||
genotypes.put(g.getSampleName(), g);
|
||||
}
|
||||
}
|
||||
// todo -- we need access to our track name to name the variant context
|
||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||
}
|
||||
|
||||
|
|
@ -431,7 +450,11 @@ public class VCF4Codec implements FeatureCodec {
|
|||
|
||||
for (Allele a : unclippedAlleles)
|
||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)),
|
||||
|
||||
// the new reference length
|
||||
int refLength = ref.length() - forwardClipping - reverseClipped;
|
||||
|
||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+forwardClipping+Math.max(refLength - 1,0))),
|
||||
newAlleleList);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import org.broad.tribble.util.AsciiLineReader;
|
|||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
|
@ -20,6 +22,7 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
|
|
@ -170,27 +173,52 @@ public class VCF4UnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
|
||||
// two constants for testing
|
||||
// test too many info fields
|
||||
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HH\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
@Test(expected=StingException.class)
|
||||
public void testCheckTooManyInfoFields() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
testSetup.codec.decode(twoManyInfoLine);
|
||||
}
|
||||
// test a regular line
|
||||
String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
|
||||
@Test
|
||||
public void testCheckInfoValidation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
testSetup.codec.decode(regularLine);
|
||||
}
|
||||
|
||||
// test too few info lines, we don't provide the DP in this line
|
||||
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
@Test
|
||||
public void testCheckTwoFewInfoValidation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
testSetup.codec.decode(twoFewInfoLine);
|
||||
}
|
||||
|
||||
@Test(expected=StingException.class)
|
||||
public void testCheckTwoManyInfoValidation() {
|
||||
// test that we're getting the right genotype for a multi-base polymorphism
|
||||
String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
@Test
|
||||
public void testMNPValidation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
testSetup.codec.decode(twoManyInfoLine);
|
||||
VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine);
|
||||
Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
Assert.assertTrue(genotypes.containsKey("NA00003"));
|
||||
Genotype g = genotypes.get("NA00003");
|
||||
Assert.assertTrue("Expected AT genotype, saw = " + g.getAllele(0),"AT".equals(g.getAllele(0).toString()));
|
||||
Assert.assertTrue(vc.getType()== VariantContext.Type.MNP);
|
||||
}
|
||||
|
||||
// test that we're getting the right genotype for what appears to be a multi-base polymorphism, but is really just a SNP
|
||||
String MNPLine2 = "20\t14370\trs6054257\tGT\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||
@Test
|
||||
public void testMNP2Validation() {
|
||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||
VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine2);
|
||||
Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
Assert.assertTrue(genotypes.containsKey("NA00003"));
|
||||
Genotype g = genotypes.get("NA00003");
|
||||
Assert.assertTrue("Expected A genotype, saw = " + g.getAllele(0),"A".equals(g.getAllele(0).toString()));
|
||||
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
|
||||
}
|
||||
|
||||
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||
|
|
@ -218,8 +246,9 @@ public class VCF4UnitTest extends BaseTest {
|
|||
try {
|
||||
testSetup.codec.decode(line);
|
||||
} catch (Exception e) {
|
||||
System.err.println(e.getMessage() + " -> " + line);
|
||||
System.err.println(line);
|
||||
//System.err.println(e.getMessage() + " -> " + line);
|
||||
//System.err.println(line);
|
||||
Assert.fail("Bad record from line " + line + " message = " + e.getMessage());
|
||||
badRecordCount++;
|
||||
}
|
||||
line = reader.readLine();
|
||||
|
|
|
|||
Loading…
Reference in New Issue