diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java index 0c6096e0c..d1d616c97 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -83,12 +83,12 @@ public final class IntervalBinding { // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files - FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); + final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); if ( codec instanceof ReferenceDependentFeatureCodec ) ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser()); try { - FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource())); - AsciiLineReader lineReader = new AsciiLineReader(fis); + final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource())); + final AsciiLineReader lineReader = new AsciiLineReader(fis); codec.readHeader(lineReader); String line = lineReader.readLine(); while ( line != null ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 8180eba30..c8f8de770 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -44,6 +44,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // todo: make this thread safe? protected String[] parts = null; protected String[] genotypeParts = null; + protected final String[] locParts = new String[6]; // for performance we cache the hashmap of filter encodings for quick lookup protected HashMap> filterHash = new HashMap>(); @@ -198,8 +199,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // our header cannot be null, we need the genotype sample names and counts if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - final String[] locParts = new String[6]; - int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); if ( nParts != 6 ) throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); @@ -216,7 +216,23 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // ref alleles don't need to be single bases for monomorphic sites if ( alleles.size() == 1 ) { stop = start + alleles.get(0).length() - 1; - } else if ( !isSingleNucleotideEvent(alleles) ) { + } + // we need to parse the INFO field to check for an END tag if it's a symbolic allele + else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) { + final String[] extraParts = new String[4]; + final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + if ( nExtraParts < 3 ) + throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); + + final Map attrs = parseInfo(extraParts[2]); + try { + stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start; + } catch (Exception e) { + throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo); + } + } + // handle multi-positional events + else if ( !isSingleNucleotideEvent(alleles) ) { stop = clipAlleles(start, ref, alleles, null, lineNo); } @@ -306,22 +322,33 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { String alts = getCachedString(parts[4].toUpperCase()); builder.log10PError(parseQual(parts[5])); builder.filters(parseFilters(getCachedString(parts[6]))); - builder.attributes(parseInfo(parts[7])); + final Map attrs = parseInfo(parts[7]); + builder.attributes(attrs); // get our alleles, filters, and setup an attribute map List alleles = parseAlleles(ref, alts, lineNo); // find out our current location, and clip the alleles down to their minimum length - int loc = pos; + int stop = pos; // ref alleles don't need to be single bases for monomorphic sites if ( alleles.size() == 1 ) { - loc = pos + alleles.get(0).length() - 1; - } else if ( !isSingleNucleotideEvent(alleles) ) { + stop = pos + alleles.get(0).length() - 1; + } + // we need to parse the INFO field to check for an END tag if it's a symbolic allele + else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) { + try { + stop = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); + } catch (Exception e) { + generateException("the END value in the INFO field is not valid"); + } + } + // handle multi-positional events + else if ( !isSingleNucleotideEvent(alleles) ) { ArrayList newAlleles = new ArrayList(); - loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); + stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } - builder.stop(loc); + builder.stop(stop); builder.alleles(alleles); // do we have genotyping data @@ -345,7 +372,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { generateException(e.getMessage()); } - return vc; } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index a8364419d..756966e97 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -229,7 +229,7 @@ public class IntervalIntegrationTest extends WalkerTest { @Test(enabled = true) public void testEmptyVCF() { - String md5 = ""; + String md5 = "897316929176464ebc9ad085f31e7284"; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T CountLoci" + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + @@ -238,12 +238,12 @@ public class IntervalIntegrationTest extends WalkerTest { " -L " + validationDataLocation + "intervalTest.empty.vcf", 1, // just one output file Arrays.asList(md5)); - executeTest("testEmptyVCFError", spec); + executeTest("testEmptyVCFWarning", spec); } @Test(enabled = true) public void testIncludeExcludeIsTheSame() { - String md5 = ""; + String md5 = "897316929176464ebc9ad085f31e7284"; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T CountLoci" + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + @@ -256,5 +256,17 @@ public class IntervalIntegrationTest extends WalkerTest { executeTest("testIncludeExcludeIsTheSame", spec); } - + @Test(enabled = true) + public void testSymbolicAlleles() { + String md5 = "52745056d2fd5904857bbd4984c08098"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" + + " -R " + b36KGReference + + " -o %s" + + " -L " + validationDataLocation + "symbolic_alleles_1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testSymbolicAlleles", spec); + } }