When constructing VariantContexts from symbolic alleles, check for the END tag in the INFO field; if present, set the stop position of the VC accordingly. Added integration test to ensure that this is working properly for use with -L intervals.
This commit is contained in:
parent
1248f9025d
commit
337ff7887a
|
|
@ -83,12 +83,12 @@ public final class IntervalBinding<T extends Feature> {
|
||||||
|
|
||||||
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
|
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
|
||||||
|
|
||||||
FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
|
final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
|
||||||
if ( codec instanceof ReferenceDependentFeatureCodec )
|
if ( codec instanceof ReferenceDependentFeatureCodec )
|
||||||
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
|
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
|
||||||
try {
|
try {
|
||||||
FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
|
final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
|
||||||
AsciiLineReader lineReader = new AsciiLineReader(fis);
|
final AsciiLineReader lineReader = new AsciiLineReader(fis);
|
||||||
codec.readHeader(lineReader);
|
codec.readHeader(lineReader);
|
||||||
String line = lineReader.readLine();
|
String line = lineReader.readLine();
|
||||||
while ( line != null ) {
|
while ( line != null ) {
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
// todo: make this thread safe?
|
// todo: make this thread safe?
|
||||||
protected String[] parts = null;
|
protected String[] parts = null;
|
||||||
protected String[] genotypeParts = null;
|
protected String[] genotypeParts = null;
|
||||||
|
protected final String[] locParts = new String[6];
|
||||||
|
|
||||||
// for performance we cache the hashmap of filter encodings for quick lookup
|
// for performance we cache the hashmap of filter encodings for quick lookup
|
||||||
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
|
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
|
||||||
|
|
@ -198,8 +199,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
// our header cannot be null, we need the genotype sample names and counts
|
// our header cannot be null, we need the genotype sample names and counts
|
||||||
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
|
if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
|
||||||
|
|
||||||
final String[] locParts = new String[6];
|
final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
|
||||||
int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
|
|
||||||
|
|
||||||
if ( nParts != 6 )
|
if ( nParts != 6 )
|
||||||
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
|
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
|
||||||
|
|
@ -216,7 +216,23 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
// ref alleles don't need to be single bases for monomorphic sites
|
// ref alleles don't need to be single bases for monomorphic sites
|
||||||
if ( alleles.size() == 1 ) {
|
if ( alleles.size() == 1 ) {
|
||||||
stop = start + alleles.get(0).length() - 1;
|
stop = start + alleles.get(0).length() - 1;
|
||||||
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
}
|
||||||
|
// we need to parse the INFO field to check for an END tag if it's a symbolic allele
|
||||||
|
else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) {
|
||||||
|
final String[] extraParts = new String[4];
|
||||||
|
final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
|
||||||
|
if ( nExtraParts < 3 )
|
||||||
|
throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
|
||||||
|
|
||||||
|
final Map<String, Object> attrs = parseInfo(extraParts[2]);
|
||||||
|
try {
|
||||||
|
stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start;
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// handle multi-positional events
|
||||||
|
else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||||
stop = clipAlleles(start, ref, alleles, null, lineNo);
|
stop = clipAlleles(start, ref, alleles, null, lineNo);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -306,22 +322,33 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
String alts = getCachedString(parts[4].toUpperCase());
|
String alts = getCachedString(parts[4].toUpperCase());
|
||||||
builder.log10PError(parseQual(parts[5]));
|
builder.log10PError(parseQual(parts[5]));
|
||||||
builder.filters(parseFilters(getCachedString(parts[6])));
|
builder.filters(parseFilters(getCachedString(parts[6])));
|
||||||
builder.attributes(parseInfo(parts[7]));
|
final Map<String, Object> attrs = parseInfo(parts[7]);
|
||||||
|
builder.attributes(attrs);
|
||||||
|
|
||||||
// get our alleles, filters, and setup an attribute map
|
// get our alleles, filters, and setup an attribute map
|
||||||
List<Allele> alleles = parseAlleles(ref, alts, lineNo);
|
List<Allele> alleles = parseAlleles(ref, alts, lineNo);
|
||||||
|
|
||||||
// find out our current location, and clip the alleles down to their minimum length
|
// find out our current location, and clip the alleles down to their minimum length
|
||||||
int loc = pos;
|
int stop = pos;
|
||||||
// ref alleles don't need to be single bases for monomorphic sites
|
// ref alleles don't need to be single bases for monomorphic sites
|
||||||
if ( alleles.size() == 1 ) {
|
if ( alleles.size() == 1 ) {
|
||||||
loc = pos + alleles.get(0).length() - 1;
|
stop = pos + alleles.get(0).length() - 1;
|
||||||
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
}
|
||||||
|
// we need to parse the INFO field to check for an END tag if it's a symbolic allele
|
||||||
|
else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) {
|
||||||
|
try {
|
||||||
|
stop = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString());
|
||||||
|
} catch (Exception e) {
|
||||||
|
generateException("the END value in the INFO field is not valid");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// handle multi-positional events
|
||||||
|
else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||||
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
|
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
|
||||||
loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
|
stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
|
||||||
alleles = newAlleles;
|
alleles = newAlleles;
|
||||||
}
|
}
|
||||||
builder.stop(loc);
|
builder.stop(stop);
|
||||||
builder.alleles(alleles);
|
builder.alleles(alleles);
|
||||||
|
|
||||||
// do we have genotyping data
|
// do we have genotyping data
|
||||||
|
|
@ -345,7 +372,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
generateException(e.getMessage());
|
generateException(e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return vc;
|
return vc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -229,7 +229,7 @@ public class IntervalIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testEmptyVCF() {
|
public void testEmptyVCF() {
|
||||||
String md5 = "";
|
String md5 = "897316929176464ebc9ad085f31e7284";
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T CountLoci" +
|
"-T CountLoci" +
|
||||||
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
|
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
|
||||||
|
|
@ -238,12 +238,12 @@ public class IntervalIntegrationTest extends WalkerTest {
|
||||||
" -L " + validationDataLocation + "intervalTest.empty.vcf",
|
" -L " + validationDataLocation + "intervalTest.empty.vcf",
|
||||||
1, // just one output file
|
1, // just one output file
|
||||||
Arrays.asList(md5));
|
Arrays.asList(md5));
|
||||||
executeTest("testEmptyVCFError", spec);
|
executeTest("testEmptyVCFWarning", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testIncludeExcludeIsTheSame() {
|
public void testIncludeExcludeIsTheSame() {
|
||||||
String md5 = "";
|
String md5 = "897316929176464ebc9ad085f31e7284";
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T CountLoci" +
|
"-T CountLoci" +
|
||||||
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
|
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
|
||||||
|
|
@ -256,5 +256,17 @@ public class IntervalIntegrationTest extends WalkerTest {
|
||||||
executeTest("testIncludeExcludeIsTheSame", spec);
|
executeTest("testIncludeExcludeIsTheSame", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testSymbolicAlleles() {
|
||||||
|
String md5 = "52745056d2fd5904857bbd4984c08098";
|
||||||
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
"-T CountLoci" +
|
||||||
|
" -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" +
|
||||||
|
" -R " + b36KGReference +
|
||||||
|
" -o %s" +
|
||||||
|
" -L " + validationDataLocation + "symbolic_alleles_1.vcf",
|
||||||
|
1, // just one output file
|
||||||
|
Arrays.asList(md5));
|
||||||
|
executeTest("testSymbolicAlleles", spec);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue