Add code to allow reference alleles with 'N' in VariantContext, but not in the alternate allele(s). Also more updates to the VCF 4 code (fixed parsing for files without genotypes).

This check-in will temperarly break the build (I need to see if Bamboo is correctly returning the log file for the failed builds).  

Will be fixed once Bamboo starts building.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3609 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-22 18:26:37 +00:00
parent 824c2bbac0
commit a6d3e4bd47
5 changed files with 162 additions and 84 deletions

View File

@ -106,7 +106,7 @@ public class Allele implements Comparable<Allele> {
this.isRef = isRef;
this.bases = bases;
if ( ! acceptableAlleleBases(bases) )
if ( ! acceptableAlleleBases(bases,isRef) )
throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases));
}
@ -186,24 +186,29 @@ public class Allele implements Comparable<Allele> {
/**
* @param bases bases representing an allele
* @param reference is this the reference allele
* @return true if the bases represent the well formatted allele
*/
public static boolean acceptableAlleleBases(String bases) {
return acceptableAlleleBases(bases.getBytes());
public static boolean acceptableAlleleBases(String bases, boolean reference) {
return acceptableAlleleBases(bases.getBytes(),reference);
}
/**
* @param bases bases representing an allele
* @param reference are we the reference (we allow n's in the reference allele)
* @return true if the bases represent the well formatted allele
*/
public static boolean acceptableAlleleBases(byte[] bases) {
public static boolean acceptableAlleleBases(byte[] bases, boolean reference) {
if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) )
return true;
for ( int i = 0; i < bases.length; i++ ) {
switch (bases[i]) {
case 'A': case 'C': case 'G': case 'T': break;
default: return false;
case 'N': if(!reference)
return false; break;
default:
return false;
}
// if ( ! BaseUtils.isRegularBase(bases[i]) ) {
// return false;

View File

@ -94,7 +94,7 @@ public class VariantContextAdaptors {
private static class DBSnpAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input, ReferenceContext ref) {
DbSNPFeature dbsnp = (DbSNPFeature)input;
if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) )
if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp),true) )
return null;
Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
@ -105,7 +105,7 @@ public class VariantContextAdaptors {
// add all of the alt alleles
for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
if ( ! Allele.acceptableAlleleBases(alt,false) ) {
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
return null;
}
@ -131,7 +131,7 @@ public class VariantContextAdaptors {
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, ReferenceContext ref) {
if ( vcf.isReference() || vcf.isSNP() || vcf.isIndel() ) {
// add the reference allele
if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) {
if ( ! Allele.acceptableAlleleBases(vcf.getReference(),true) ) {
System.out.printf("Excluding vcf record %s%n", vcf);
return null;
}
@ -146,7 +146,7 @@ public class VariantContextAdaptors {
alleles.add(refAllele);
for ( VCFGenotypeEncoding alt : vcf.getAlternateAlleles() ) {
if ( ! Allele.acceptableAlleleBases(alt.getBases()) ) {
if ( ! Allele.acceptableAlleleBases(alt.getBases(),false) ) {
//System.out.printf("Excluding vcf record %s%n", vcf);
return null;
}
@ -531,7 +531,7 @@ public class VariantContextAdaptors {
VariantContext convert(String name, Object input, ReferenceContext ref) {
RodGLF glf = (RodGLF)input;
if ( ! Allele.acceptableAlleleBases(glf.getReference()) )
if ( ! Allele.acceptableAlleleBases(glf.getReference(),true) )
return null;
Allele refAllele = Allele.create(glf.getReference(), true);
@ -543,7 +543,7 @@ public class VariantContextAdaptors {
// add all of the alt alleles
for ( String alt : glf.getAlternateAlleleList() ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
if ( ! Allele.acceptableAlleleBases(alt,false) ) {
return null;
}
Allele allele = Allele.create(alt, false);
@ -603,7 +603,7 @@ public class VariantContextAdaptors {
*/
VariantContext convert(String name, Object input, ReferenceContext ref) {
GeliTextFeature geli = (GeliTextFeature)input;
if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase())) )
if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase()),true) )
return null;
Allele refAllele = Allele.create(String.valueOf(geli.getRefBase()), true);
@ -614,7 +614,7 @@ public class VariantContextAdaptors {
List<Allele> genotypeAlleles = new ArrayList<Allele>();
// add all of the alt alleles
for ( char alt : geli.getGenotype().toString().toCharArray() ) {
if ( ! Allele.acceptableAlleleBases(String.valueOf(alt)) ) {
if ( ! Allele.acceptableAlleleBases(String.valueOf(alt),false) ) {
return null;
}
Allele allele = Allele.create(String.valueOf(alt), false);
@ -670,7 +670,7 @@ public class VariantContextAdaptors {
*/
VariantContext convert(String name, Object input, ReferenceContext ref) {
GenotypeLikelihoods geli = ((rodGELI)input).getGeliRecord();
if ( ! Allele.acceptableAlleleBases(String.valueOf((char)geli.getReferenceBase())) )
if ( ! Allele.acceptableAlleleBases(String.valueOf((char)geli.getReferenceBase()),true) )
return null;
Allele refAllele = Allele.create(String.valueOf((char)geli.getReferenceBase()), true);
@ -679,14 +679,14 @@ public class VariantContextAdaptors {
alleles.add(refAllele);
// add the two alt alleles
if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele1()))) {
if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele1()),false)) {
return null;
}
Allele allele1 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele1()), false);
if (!alleles.contains(allele1) && !allele1.equals(refAllele, true)) alleles.add(allele1);
// add the two alt alleles
if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele2()))) {
if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele2()),false)) {
return null;
}
Allele allele2 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele2()), false);

View File

@ -28,31 +28,29 @@ public class VCF4Codec implements FeatureCodec {
// we have to store the list of strings that make up the header until they're needed
private VCFHeader header = null;
// used to subtract from the
// used to convert the index of the alternate allele in genotypes to a integer index
private static int ZERO_CHAR = (byte)'0';
// a mapping of the allele
private static Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
//private static String[] CachedGTKey = new String[100];
// cache the genotyope values
private static String[] CachedGTValues = new String[100];
// for performance testing purposes
public static boolean parseGenotypesToo = true;
// for performance testing purposes
public static boolean validate = true;
// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
// todo: make this thread safe?
private String[] parts = null;
// for performance we cache the hashmap of filter encodings for quick lookup
private HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
// a set of the genotype keys? TODO: rename to something better
private String[] GTKeys = new String[100];
// a set of the genotype keys?
private String[] genotypeKeyArray = new String[100];
// a list of the info fields, filter fields, and format fields
// a list of the info fields, filter fields, and format fields, for quick lookup to validate against
ArrayList<String> infoFields = new ArrayList<String>();
ArrayList<String> formatFields = new ArrayList<String>();
ArrayList<String> filterFields = new ArrayList<String>();
@ -60,6 +58,9 @@ public class VCF4Codec implements FeatureCodec {
// do we want to validate the info, format, and filter fields
private final boolean validateFromHeader = true;
// we store a name to give to each of the variant contexts we emit
private String name = "Unkown";
/**
* this method is a big hack, since I haven't gotten to updating the VCF header for the 4.0 updates
* @param reader the line reader to take header lines from
@ -76,27 +77,7 @@ public class VCF4Codec implements FeatureCodec {
headerStrings.add(line);
}
else if (line.startsWith("#")) {
headerStrings.add(line);
header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
// load the parsing fields
Set<VCFHeaderLine> headerLines = header.getMetaData();
// setup our look-up lists for validation
for (VCFHeaderLine hl : headerLines) {
if (hl.getClass() == VCFFilterHeaderLine.class)
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
if (hl.getClass() == VCFFormatHeaderLine.class)
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
if (hl.getClass() == VCFInfoHeaderLine.class)
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
}
// sort the lists
Collections.sort(filterFields);
Collections.sort(formatFields);
Collections.sort(infoFields);
return headerStrings.size();
return createHeader(headerStrings, line);
}
else {
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
@ -110,6 +91,41 @@ public class VCF4Codec implements FeatureCodec {
}
/**
* create a VCF header
* @param headerStrings a list of strings that represent all the ## entries
* @param line the single # line (column names)
* @return the count of header lines
*/
private int createHeader(List<String> headerStrings, String line) {
headerStrings.add(line);
header = VCFReaderUtils.createHeader(headerStrings, VCFHeaderVersion.VCF4_0);
// load the parsing fields
Set<VCFHeaderLine> headerLines = header.getMetaData();
// setup our look-up lists for validation
for (VCFHeaderLine hl : headerLines) {
if (hl.getClass() == VCFFilterHeaderLine.class)
this.filterFields.add(((VCFFilterHeaderLine)hl).getmName());
if (hl.getClass() == VCFFormatHeaderLine.class)
this.formatFields.add(((VCFFormatHeaderLine)hl).getmName());
if (hl.getClass() == VCFInfoHeaderLine.class)
this.infoFields.add(((VCFInfoHeaderLine)hl).getmName());
}
// sort the lists so we can binary search them later on
Collections.sort(filterFields);
Collections.sort(formatFields);
Collections.sort(infoFields);
return headerStrings.size();
}
/**
* the fast decode function
* @param line the line of text for the record
* @return a feature, (not guaranteed complete) that has the correct start and stop
*/
public Feature decodeLoc(String line) {
return decode(line);
}
@ -205,6 +221,11 @@ public class VCF4Codec implements FeatureCodec {
return attributes;
}
/**
* validate the attributes against the stored fields of the appopriate type
* @param attributes the list of fields to check for inclusion against the field array
* @param fields the master list; all attributes must be in this list to validate
*/
private void validateFields(Set<String> attributes, List<String> fields) {
// validate the info fields
if (validateFromHeader) {
@ -234,15 +255,15 @@ public class VCF4Codec implements FeatureCodec {
private List<Allele> parseAlleles(String ref, String alts) {
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
// ref
checkAllele(ref);
checkAllele(ref, true);
Allele refAllele = Allele.create(ref, true);
alleles.add(refAllele);
if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split
parseSingleAllele(alleles, alts);
parseSingleAllele(alleles, alts, false);
else
for ( String alt : Utils.split(alts, ",") )
parseSingleAllele(alleles, alt);
parseSingleAllele(alleles, alt, false);
return alleles;
}
@ -250,9 +271,10 @@ public class VCF4Codec implements FeatureCodec {
/**
* check to make sure the allele is an acceptable allele
* @param allele the allele to check
* @param isRef are we the reference allele?
*/
private static void checkAllele(String allele) {
if ( ! Allele.acceptableAlleleBases(allele) ) {
private static void checkAllele(String allele,boolean isRef) {
if ( ! Allele.acceptableAlleleBases(allele,isRef) ) {
throw new StingException("Unparsable vcf record with allele " + allele);
}
}
@ -261,9 +283,10 @@ public class VCF4Codec implements FeatureCodec {
* parse a single allele, given the allele list
* @param alleles the alleles available
* @param alt the allele to parse
* @param isRef are we the reference allele?
*/
private void parseSingleAllele(List<Allele> alleles, String alt) {
checkAllele(alt);
private void parseSingleAllele(List<Allele> alleles, String alt, boolean isRef) {
checkAllele(alt,isRef);
Allele allele = Allele.create(alt, false);
if ( ! allele.isNoCall() )
@ -304,10 +327,12 @@ public class VCF4Codec implements FeatureCodec {
/**
* parse out the VCF line
*
* @param parts the parts split up
* @return a variant context object
*/
private VariantContext parseVCFLine(String[] parts) {
// parse out the required fields
String contig = parts[0];
long pos = Long.valueOf(parts[1]);
String id = parts[2];
@ -316,26 +341,31 @@ public class VCF4Codec implements FeatureCodec {
Double qual = parseQual(parts[5]);
String filter = parts[6];
String info = parts[7];
String GT = parts[8];
int genotypesStart = 9;
List<Allele> alleles = parseAlleles(ref, alts);
List<Allele> alleles = parseAlleles(ref, alts);
Set<String> filters = parseFilters(filter);
Map<String, Object> attributes = parseInfo(info, id);
// parse genotypes
int nGTKeys = ParsingUtils.split(GT, GTKeys, ':');
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
Iterator<String> iter = header.getGenotypeSamples().iterator();
// find out our current location, and clip the alleles down to their minimum length
Pair<GenomeLoc, List<Allele>> locAndAlleles = (ref.length() > 1) ?
clipAlleles(contig, pos, ref, alleles) :
new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
Pair<GenomeLoc,List<Allele>> locAndAlleles = (ref.length() > 1) ?
clipAlleles(contig,pos,ref,alleles) :
new Pair<GenomeLoc,List<Allele>>(GenomeLocParser.createGenomeLoc(contig,pos),alleles);
// a map to store our genotypes
Map<String, Genotype> genotypes = null;
// do we have genotyping data
if (parts.length > 8) {
String GT = parts[8];
int genotypesStart = 9;
// parse genotypes
int nGTKeys = ParsingUtils.split(GT, genotypeKeyArray, ':');
genotypes = new HashMap<String, Genotype>(Math.max(parts.length - genotypesStart, 1));
Iterator<String> iter = header.getGenotypeSamples().iterator();
if ( parseGenotypesToo ) {
alleleMap.clear();
for ( int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++ ) {
for (int genotypeOffset = genotypesStart; genotypeOffset < parts.length; genotypeOffset++) {
String sample = parts[genotypeOffset];
String[] GTValues = CachedGTValues;
ParsingUtils.split(sample, GTValues, ':');
@ -345,30 +375,30 @@ public class VCF4Codec implements FeatureCodec {
// todo -- the parsing of attributes could be made lazy for performance
Map<String, String> gtAttributes = null;
if ( nGTKeys > 1 ) {
if (nGTKeys > 1) {
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
for ( int i = 1; i < nGTKeys; i++ ) {
if ( GTKeys[i].equals("GQ") ) {
for (int i = 1; i < nGTKeys; i++) {
if (genotypeKeyArray[i].equals("GQ")) {
GTQual = parseQual(GTValues[i]);
} if ( GTKeys[i].equals("FL") ) { // deal with genotype filters here
}
if (genotypeKeyArray[i].equals("FL")) { // deal with genotype filters here
genotypeFilters.addAll(parseFilters(GTValues[i]));
} else {
gtAttributes.put(GTKeys[i], GTValues[i]);
gtAttributes.put(genotypeKeyArray[i], GTValues[i]);
}
}
// validate the format fields
validateFields(gtAttributes.keySet(),formatFields);
validateFields(gtAttributes.keySet(), formatFields);
}
boolean phased = GTKeys[0].charAt(1) == '|';
boolean phased = genotypeKeyArray[0].charAt(1) == '|';
Genotype g = new Genotype(iter.next(), genotypeAlleles, GTQual, genotypeFilters, gtAttributes, phased);
genotypes.put(g.getSampleName(), g);
}
}
// todo -- we need access to our track name to name the variant context
return new VariantContext("foo", locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
return new VariantContext(name, locAndAlleles.first, locAndAlleles.second, genotypes, qual, filters, attributes);
}
/**
@ -434,4 +464,20 @@ public class VCF4Codec implements FeatureCodec {
if (clazz != VCFHeader.class) throw new ClassCastException("expecting class " + clazz + " but VCF4Codec provides " + VCFHeader.class);
return this.header;
}
/**
* get the name of this codec
* @return our set name
*/
public String getName() {
return name;
}
/**
* set the name of this codec
* @param name
*/
public void setName(String name) {
this.name = name;
}
}

View File

@ -92,7 +92,7 @@ public class VCF4ReaderTestWalker extends RodWalker<VCFRecord,Long> {
String[] parts = new String[10000];
public void onTraversalDone(Long num){
VCF4Codec vcf4codec = new VCF4Codec();
VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
//VCF4Codec.parseGenotypesToo = splitFile == ParsingStatus.GENOTYPES;
VCF4Codec.validate = ! DontValidate;
VCFCodec vcf3codec = new VCFCodec();

View File

@ -26,7 +26,8 @@ import java.util.Set;
* test out pieces of the VCF 4 codec.
*/
public class VCF4UnitTest extends BaseTest {
File vcfFile = new File("testdata/vcfExmaple.vcf4");
File vcfGenotypeFile = new File("testdata/vcf/vcfWithGenotypes.vcf");
File vcfNoGenotypeFile = new File("testdata/vcf/vcfWithoutGenotypes.vcf");
// setup the contig ordering
@BeforeClass
@ -42,7 +43,7 @@ public class VCF4UnitTest extends BaseTest {
@Test
public void testReadBasicHeader() {
TestSetup testSetup = new TestSetup().invoke(vcfFile);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
int seenRecords = 0;
@ -95,7 +96,7 @@ public class VCF4UnitTest extends BaseTest {
@Test
public void testOutputHeader() {
TestSetup testSetup = new TestSetup().invoke(vcfFile);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
File tempFile = null;
try {
@ -115,9 +116,8 @@ public class VCF4UnitTest extends BaseTest {
@Test
public void testCountVCF4Records() {
TestSetup testSetup = new TestSetup().invoke(vcfFile);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
AsciiLineReader reader = testSetup.getReader();
VCF4Codec codec = testSetup.getCodec();
// now parse the lines
String line = null;
@ -139,6 +139,33 @@ public class VCF4UnitTest extends BaseTest {
Assert.fail("Failed to read a line");
}
}
Assert.assertEquals(7,recordCount);
}
@Test
public void testCountVCF4RecordsWithoutGenotypes() {
TestSetup testSetup = new TestSetup().invoke(vcfNoGenotypeFile);
AsciiLineReader reader = testSetup.getReader();
// now parse the lines
String line = null;
try {
line = reader.readLine();
} catch (IOException e) {
Assert.fail("Failed to read a line");
}
// our record count
int recordCount = 0;
while (line != null) {
try {
recordCount++;
testSetup.codec.decode(line);
line = reader.readLine();
} catch (IOException e) {
Assert.fail("Failed to read a line");
}
}
Assert.assertEquals(6,recordCount);
}
@ -150,25 +177,25 @@ public class VCF4UnitTest extends BaseTest {
@Test
public void testCheckInfoValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfFile);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(regularLine);
}
@Test
public void testCheckTwoFewInfoValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfFile);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoFewInfoLine);
}
@Test(expected=StingException.class)
public void testCheckTwoManyInfoValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfFile);
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoManyInfoLine);
}
//File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
/* File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
/*@Test
@Test
public void checkLargeVCF() {
TestSetup testSetup = new TestSetup().invoke(largeVCF);
AsciiLineReader reader = testSetup.getReader();