When constructing VariantContexts from symbolic alleles, check for the END tag in the INFO field; if present, set the stop position of the VC accordingly. Added integration test to ensure that this is working properly for use with -L intervals.

This commit is contained in:
Eric Banks 2012-04-04 10:57:05 -04:00
parent 1248f9025d
commit 337ff7887a
3 changed files with 55 additions and 17 deletions

View File

@@ -83,12 +83,12 @@ public final class IntervalBinding<T extends Feature> {
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
if ( codec instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
try {
FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
AsciiLineReader lineReader = new AsciiLineReader(fis);
final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
final AsciiLineReader lineReader = new AsciiLineReader(fis);
codec.readHeader(lineReader);
String line = lineReader.readLine();
while ( line != null ) {

View File

@@ -44,6 +44,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// todo: make this thread safe?
protected String[] parts = null;
protected String[] genotypeParts = null;
protected final String[] locParts = new String[6];
// for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
@@ -198,8 +199,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
final String[] locParts = new String[6];
int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
if ( nParts != 6 )
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
@@ -216,7 +216,23 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// ref alleles don't need to be single bases for monomorphic sites
if ( alleles.size() == 1 ) {
stop = start + alleles.get(0).length() - 1;
} else if ( !isSingleNucleotideEvent(alleles) ) {
}
// we need to parse the INFO field to check for an END tag if it's a symbolic allele
else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) {
final String[] extraParts = new String[4];
final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
if ( nExtraParts < 3 )
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
final Map<String, Object> attrs = parseInfo(extraParts[2]);
try {
stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start;
} catch (Exception e) {
throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo);
}
}
// handle multi-positional events
else if ( !isSingleNucleotideEvent(alleles) ) {
stop = clipAlleles(start, ref, alleles, null, lineNo);
}
@@ -306,22 +322,33 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
String alts = getCachedString(parts[4].toUpperCase());
builder.log10PError(parseQual(parts[5]));
builder.filters(parseFilters(getCachedString(parts[6])));
builder.attributes(parseInfo(parts[7]));
final Map<String, Object> attrs = parseInfo(parts[7]);
builder.attributes(attrs);
// get our alleles, filters, and setup an attribute map
List<Allele> alleles = parseAlleles(ref, alts, lineNo);
// find out our current location, and clip the alleles down to their minimum length
int loc = pos;
int stop = pos;
// ref alleles don't need to be single bases for monomorphic sites
if ( alleles.size() == 1 ) {
loc = pos + alleles.get(0).length() - 1;
} else if ( !isSingleNucleotideEvent(alleles) ) {
stop = pos + alleles.get(0).length() - 1;
}
// we need to parse the INFO field to check for an END tag if it's a symbolic allele
else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) {
try {
stop = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString());
} catch (Exception e) {
generateException("the END value in the INFO field is not valid");
}
}
// handle multi-positional events
else if ( !isSingleNucleotideEvent(alleles) ) {
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
alleles = newAlleles;
}
builder.stop(loc);
builder.stop(stop);
builder.alleles(alleles);
// do we have genotyping data
@@ -345,7 +372,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
generateException(e.getMessage());
}
return vc;
}

View File

@@ -229,7 +229,7 @@ public class IntervalIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testEmptyVCF() {
String md5 = "";
String md5 = "897316929176464ebc9ad085f31e7284";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" +
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
@@ -238,12 +238,12 @@ public class IntervalIntegrationTest extends WalkerTest {
" -L " + validationDataLocation + "intervalTest.empty.vcf",
1, // just one output file
Arrays.asList(md5));
executeTest("testEmptyVCFError", spec);
executeTest("testEmptyVCFWarning", spec);
}
@Test(enabled = true)
public void testIncludeExcludeIsTheSame() {
String md5 = "";
String md5 = "897316929176464ebc9ad085f31e7284";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" +
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
@@ -256,5 +256,17 @@ public class IntervalIntegrationTest extends WalkerTest {
executeTest("testIncludeExcludeIsTheSame", spec);
}
@Test(enabled = true)
public void testSymbolicAlleles() {
String md5 = "52745056d2fd5904857bbd4984c08098";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" +
" -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" +
" -R " + b36KGReference +
" -o %s" +
" -L " + validationDataLocation + "symbolic_alleles_1.vcf",
1, // just one output file
Arrays.asList(md5));
executeTest("testSymbolicAlleles", spec);
}
}