a couple of VCF 4 improvements:
-Validation of INFO and FORMAT fields. -Conversion to the correct type for info fields (e.g. allele frequency is now stored as a float instead of a string). -Checks for CNV-style alternate allele encodings (e.g. <INS:ME:L1>); right now we throw an exception. Maybe we should just warn the user? -Tests for the multiple-base polymorphism allele case. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3622 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
54ae0b8e4e
commit
611d834092
|
|
@ -50,9 +50,9 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
// a set of the genotype keys?
|
// a set of the genotype keys?
|
||||||
private String[] genotypeKeyArray = new String[100];
|
private String[] genotypeKeyArray = new String[100];
|
||||||
|
|
||||||
// a list of the info fields, filter fields, and format fields, for quick lookup to validate against
|
// a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against
|
||||||
ArrayList<String> infoFields = new ArrayList<String>();
|
TreeMap<String, VCFInfoHeaderLine.INFO_TYPE> infoFields = new TreeMap<String, VCFInfoHeaderLine.INFO_TYPE>();
|
||||||
ArrayList<String> formatFields = new ArrayList<String>();
|
TreeMap<String, VCFFormatHeaderLine.FORMAT_TYPE> formatFields = new TreeMap<String, VCFFormatHeaderLine.FORMAT_TYPE>();
|
||||||
ArrayList<String> filterFields = new ArrayList<String>();
|
ArrayList<String> filterFields = new ArrayList<String>();
|
||||||
|
|
||||||
// do we want to validate the info, format, and filter fields
|
// do we want to validate the info, format, and filter fields
|
||||||
|
|
@ -109,14 +109,12 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
if (hl.getClass() == VCFFilterHeaderLine.class)
|
if (hl.getClass() == VCFFilterHeaderLine.class)
|
||||||
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
|
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
|
||||||
if (hl.getClass() == VCFFormatHeaderLine.class)
|
if (hl.getClass() == VCFFormatHeaderLine.class)
|
||||||
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
|
this.formatFields.put(((VCFFormatHeaderLine)hl).getmName(),((VCFFormatHeaderLine)hl).getmType());
|
||||||
if (hl.getClass() == VCFInfoHeaderLine.class)
|
if (hl.getClass() == VCFInfoHeaderLine.class)
|
||||||
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
|
this.infoFields.put(((VCFInfoHeaderLine)hl).getmName(),((VCFInfoHeaderLine)hl).getmType());
|
||||||
}
|
}
|
||||||
// sort the lists so we can binary search them later on
|
// sort the lists so we can binary search them later on
|
||||||
Collections.sort(filterFields);
|
Collections.sort(filterFields);
|
||||||
Collections.sort(formatFields);
|
|
||||||
Collections.sort(infoFields);
|
|
||||||
|
|
||||||
return headerStrings.size();
|
return headerStrings.size();
|
||||||
}
|
}
|
||||||
|
|
@ -204,7 +202,21 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
int eqI = field.indexOf("=");
|
int eqI = field.indexOf("=");
|
||||||
if ( eqI != -1 ) {
|
if ( eqI != -1 ) {
|
||||||
key = field.substring(0, eqI);
|
key = field.substring(0, eqI);
|
||||||
value = field.substring(eqI+1, field.length()); // todo -- needs to convert to int, double, etc
|
String str = field.substring(eqI+1, field.length());
|
||||||
|
|
||||||
|
// lets see if the string contains a , separator
|
||||||
|
if (str.contains(",")) {
|
||||||
|
List<Object> objects = new ArrayList<Object>();
|
||||||
|
String[] split = str.split(",");
|
||||||
|
for (String substring : split) {
|
||||||
|
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||||
|
objects.add(type.convert(substring));
|
||||||
|
}
|
||||||
|
value = objects;
|
||||||
|
} else {
|
||||||
|
VCFInfoHeaderLine.INFO_TYPE type = infoFields.get(key);
|
||||||
|
value = type.convert(str);
|
||||||
|
}
|
||||||
//System.out.printf("%s %s%n", key, value);
|
//System.out.printf("%s %s%n", key, value);
|
||||||
} else {
|
} else {
|
||||||
key = field;
|
key = field;
|
||||||
|
|
@ -215,7 +227,7 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// validate the fields
|
// validate the fields
|
||||||
validateFields(attributes.keySet(),infoFields);
|
validateFields(attributes.keySet(),new ArrayList(infoFields.keySet()));
|
||||||
|
|
||||||
attributes.put("ID", id);
|
attributes.put("ID", id);
|
||||||
return attributes;
|
return attributes;
|
||||||
|
|
@ -255,7 +267,8 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
private List<Allele> parseAlleles(String ref, String alts) {
|
private List<Allele> parseAlleles(String ref, String alts) {
|
||||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||||
// ref
|
// ref
|
||||||
checkAllele(ref, true);
|
if (!checkAllele(ref, true))
|
||||||
|
throw new StingException("Unable to parse out correct reference allele, we saw = " + ref);
|
||||||
Allele refAllele = Allele.create(ref, true);
|
Allele refAllele = Allele.create(ref, true);
|
||||||
alleles.add(refAllele);
|
alleles.add(refAllele);
|
||||||
|
|
||||||
|
|
@ -272,11 +285,17 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
* check to make sure the allele is an acceptable allele
|
* check to make sure the allele is an acceptable allele
|
||||||
* @param allele the allele to check
|
* @param allele the allele to check
|
||||||
* @param isRef are we the reference allele?
|
* @param isRef are we the reference allele?
|
||||||
|
* @return true if the allele is fine, false otherwise
|
||||||
*/
|
*/
|
||||||
private static void checkAllele(String allele,boolean isRef) {
|
private static boolean checkAllele(String allele,boolean isRef) {
|
||||||
if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
if (allele.contains("<")) {
|
||||||
|
Utils.warnUser("We are currently unable to parse out CNV encodings in VCF, we saw the following allele = " + allele);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
|
||||||
throw new StingException("Unparsable vcf record with allele " + allele);
|
throw new StingException("Unparsable vcf record with allele " + allele);
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -286,7 +305,8 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
* @param isRef are we the reference allele?
|
* @param isRef are we the reference allele?
|
||||||
*/
|
*/
|
||||||
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
|
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
|
||||||
checkAllele(alt,isRef);
|
if (!checkAllele(alt,isRef))
|
||||||
|
throw new StingException("Unable to parse out correct alt allele, we saw = " + alt);
|
||||||
|
|
||||||
Allele allele = Allele.create(alt, false);
|
Allele allele = Allele.create(alt, false);
|
||||||
if ( ! allele.isNoCall() )
|
if ( ! allele.isNoCall() )
|
||||||
|
|
@ -388,7 +408,7 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// validate the format fields
|
// validate the format fields
|
||||||
validateFields(gtAttributes.keySet(), formatFields);
|
validateFields(gtAttributes.keySet(), new ArrayList(formatFields.keySet()));
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
|
||||||
|
|
@ -397,7 +417,6 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
genotypes.put(g.getSampleName(), g);
|
genotypes.put(g.getSampleName(), g);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// todo -- we need access to our track name to name the variant context
|
|
||||||
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -431,7 +450,11 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
|
|
||||||
for (Allele a : unclippedAlleles)
|
for (Allele a : unclippedAlleles)
|
||||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
||||||
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)),
|
|
||||||
|
// the new reference length
|
||||||
|
int refLength = ref.length() - forwardClipping - reverseClipped;
|
||||||
|
|
||||||
|
return new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+forwardClipping+Math.max(refLength - 1,0))),
|
||||||
newAlleleList);
|
newAlleleList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@ import org.broad.tribble.util.AsciiLineReader;
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
@ -20,6 +22,7 @@ import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -170,27 +173,52 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// two constants for testing
|
// test too many info fields
|
||||||
|
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HH\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
|
@Test(expected=StingException.class)
|
||||||
|
public void testCheckTooManyInfoFields() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
|
testSetup.codec.decode(twoManyInfoLine);
|
||||||
|
}
|
||||||
|
// test a regular line
|
||||||
String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
String regularLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
|
||||||
String twoManyInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2;HG=12\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCheckInfoValidation() {
|
public void testCheckInfoValidation() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
testSetup.codec.decode(regularLine);
|
testSetup.codec.decode(regularLine);
|
||||||
}
|
}
|
||||||
|
// test too few info lines, we don't provide the DP in this line
|
||||||
|
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
@Test
|
@Test
|
||||||
public void testCheckTwoFewInfoValidation() {
|
public void testCheckTwoFewInfoValidation() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
testSetup.codec.decode(twoFewInfoLine);
|
testSetup.codec.decode(twoFewInfoLine);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected=StingException.class)
|
// test that we're getting the right genotype for a multi-base polymorphism
|
||||||
public void testCheckTwoManyInfoValidation() {
|
String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
|
@Test
|
||||||
|
public void testMNPValidation() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
testSetup.codec.decode(twoManyInfoLine);
|
VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine);
|
||||||
|
Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||||
|
Assert.assertTrue(genotypes.containsKey("NA00003"));
|
||||||
|
Genotype g = genotypes.get("NA00003");
|
||||||
|
Assert.assertTrue("Expected AT genotype, saw = " + g.getAllele(0),"AT".equals(g.getAllele(0).toString()));
|
||||||
|
Assert.assertTrue(vc.getType()== VariantContext.Type.MNP);
|
||||||
|
}
|
||||||
|
|
||||||
|
// test that we're getting the right genotype for what appears to be a multi-base polymorphism, but is really just a SNP
|
||||||
|
String MNPLine2 = "20\t14370\trs6054257\tGT\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
|
@Test
|
||||||
|
public void testMNP2Validation() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
|
VariantContext vc = (VariantContext)testSetup.codec.decode(MNPLine2);
|
||||||
|
Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||||
|
Assert.assertTrue(genotypes.containsKey("NA00003"));
|
||||||
|
Genotype g = genotypes.get("NA00003");
|
||||||
|
Assert.assertTrue("Expected A genotype, saw = " + g.getAllele(0),"A".equals(g.getAllele(0).toString()));
|
||||||
|
Assert.assertTrue(vc.getType()== VariantContext.Type.SNP);
|
||||||
}
|
}
|
||||||
|
|
||||||
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||||
|
|
@ -218,8 +246,9 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
try {
|
try {
|
||||||
testSetup.codec.decode(line);
|
testSetup.codec.decode(line);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
System.err.println(e.getMessage() + " -> " + line);
|
//System.err.println(e.getMessage() + " -> " + line);
|
||||||
System.err.println(line);
|
//System.err.println(line);
|
||||||
|
Assert.fail("Bad record from line " + line + " message = " + e.getMessage());
|
||||||
badRecordCount++;
|
badRecordCount++;
|
||||||
}
|
}
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue