When constructing VariantContexts from symbolic alleles, check for the END tag in the INFO field; if present, set the stop position of the VC accordingly. Added integration test to ensure that this is working properly for use with -L intervals.

This commit is contained in:
Eric Banks 2012-04-04 10:57:05 -04:00
parent 1248f9025d
commit 337ff7887a
3 changed files with 55 additions and 17 deletions

View File

@ -83,12 +83,12 @@ public final class IntervalBinding<T extends Feature> {
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
if ( codec instanceof ReferenceDependentFeatureCodec ) if ( codec instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser()); ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
try { try {
FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource())); final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
AsciiLineReader lineReader = new AsciiLineReader(fis); final AsciiLineReader lineReader = new AsciiLineReader(fis);
codec.readHeader(lineReader); codec.readHeader(lineReader);
String line = lineReader.readLine(); String line = lineReader.readLine();
while ( line != null ) { while ( line != null ) {

View File

@ -44,6 +44,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// todo: make this thread safe? // todo: make this thread safe?
protected String[] parts = null; protected String[] parts = null;
protected String[] genotypeParts = null; protected String[] genotypeParts = null;
protected final String[] locParts = new String[6];
// for performance we cache the hashmap of filter encodings for quick lookup // for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>(); protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
@ -198,8 +199,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// our header cannot be null, we need the genotype sample names and counts // our header cannot be null, we need the genotype sample names and counts
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
final String[] locParts = new String[6]; final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
if ( nParts != 6 ) if ( nParts != 6 )
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
@ -216,7 +216,23 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// ref alleles don't need to be single bases for monomorphic sites // ref alleles don't need to be single bases for monomorphic sites
if ( alleles.size() == 1 ) { if ( alleles.size() == 1 ) {
stop = start + alleles.get(0).length() - 1; stop = start + alleles.get(0).length() - 1;
} else if ( !isSingleNucleotideEvent(alleles) ) { }
// we need to parse the INFO field to check for an END tag if it's a symbolic allele
else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) {
final String[] extraParts = new String[4];
final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
if ( nExtraParts < 3 )
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
final Map<String, Object> attrs = parseInfo(extraParts[2]);
try {
stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start;
} catch (Exception e) {
throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo);
}
}
// handle multi-positional events
else if ( !isSingleNucleotideEvent(alleles) ) {
stop = clipAlleles(start, ref, alleles, null, lineNo); stop = clipAlleles(start, ref, alleles, null, lineNo);
} }
@ -306,22 +322,33 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
String alts = getCachedString(parts[4].toUpperCase()); String alts = getCachedString(parts[4].toUpperCase());
builder.log10PError(parseQual(parts[5])); builder.log10PError(parseQual(parts[5]));
builder.filters(parseFilters(getCachedString(parts[6]))); builder.filters(parseFilters(getCachedString(parts[6])));
builder.attributes(parseInfo(parts[7])); final Map<String, Object> attrs = parseInfo(parts[7]);
builder.attributes(attrs);
// get our alleles, filters, and setup an attribute map // get our alleles, filters, and setup an attribute map
List<Allele> alleles = parseAlleles(ref, alts, lineNo); List<Allele> alleles = parseAlleles(ref, alts, lineNo);
// find out our current location, and clip the alleles down to their minimum length // find out our current location, and clip the alleles down to their minimum length
int loc = pos; int stop = pos;
// ref alleles don't need to be single bases for monomorphic sites // ref alleles don't need to be single bases for monomorphic sites
if ( alleles.size() == 1 ) { if ( alleles.size() == 1 ) {
loc = pos + alleles.get(0).length() - 1; stop = pos + alleles.get(0).length() - 1;
} else if ( !isSingleNucleotideEvent(alleles) ) { }
// we need to parse the INFO field to check for an END tag if it's a symbolic allele
else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) {
try {
stop = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString());
} catch (Exception e) {
generateException("the END value in the INFO field is not valid");
}
}
// handle multi-positional events
else if ( !isSingleNucleotideEvent(alleles) ) {
ArrayList<Allele> newAlleles = new ArrayList<Allele>(); ArrayList<Allele> newAlleles = new ArrayList<Allele>();
loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
alleles = newAlleles; alleles = newAlleles;
} }
builder.stop(loc); builder.stop(stop);
builder.alleles(alleles); builder.alleles(alleles);
// do we have genotyping data // do we have genotyping data
@ -345,7 +372,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
generateException(e.getMessage()); generateException(e.getMessage());
} }
return vc; return vc;
} }

View File

@ -229,7 +229,7 @@ public class IntervalIntegrationTest extends WalkerTest {
@Test(enabled = true) @Test(enabled = true)
public void testEmptyVCF() { public void testEmptyVCF() {
String md5 = ""; String md5 = "897316929176464ebc9ad085f31e7284";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" + "-T CountLoci" +
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
@ -238,12 +238,12 @@ public class IntervalIntegrationTest extends WalkerTest {
" -L " + validationDataLocation + "intervalTest.empty.vcf", " -L " + validationDataLocation + "intervalTest.empty.vcf",
1, // just one output file 1, // just one output file
Arrays.asList(md5)); Arrays.asList(md5));
executeTest("testEmptyVCFError", spec); executeTest("testEmptyVCFWarning", spec);
} }
@Test(enabled = true) @Test(enabled = true)
public void testIncludeExcludeIsTheSame() { public void testIncludeExcludeIsTheSame() {
String md5 = ""; String md5 = "897316929176464ebc9ad085f31e7284";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" + "-T CountLoci" +
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
@ -256,5 +256,17 @@ public class IntervalIntegrationTest extends WalkerTest {
executeTest("testIncludeExcludeIsTheSame", spec); executeTest("testIncludeExcludeIsTheSame", spec);
} }
@Test(enabled = true)
public void testSymbolicAlleles() {
String md5 = "52745056d2fd5904857bbd4984c08098";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" +
" -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" +
" -R " + b36KGReference +
" -o %s" +
" -L " + validationDataLocation + "symbolic_alleles_1.vcf",
1, // just one output file
Arrays.asList(md5));
executeTest("testSymbolicAlleles", spec);
}
} }