diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index b4cf96831..e3f9b8ee3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -116,6 +116,7 @@ public class VCFDiffableReader implements DiffableReader { if ( g.hasDP() ) gRoot.add("DP", g.getDP() ); if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD())); if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL())); + if ( g.getFilters() != null ) gRoot.add("FT", g.getFilters()); for (Map.Entry attribute : g.getExtendedAttributes().entrySet()) { if ( ! attribute.getKey().startsWith("_") ) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 06fb07f94..e106e72ac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -332,7 +332,9 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD protected List clipAllelesIfNecessary(final int position, final String ref, final List unclippedAlleles) { - final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles); + // the last argument of 1 allows us to safely ignore the end, because we are + // ultimately going to use the end in the record itself + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles, 1); if ( clipped.getError() != null ) error(clipped.getError()); return clipped.getClippedAlleles(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 6302d1053..1d5869349 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -186,79 +186,19 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - - final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - - if ( nParts != 6 ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - // get our alleles (because the end position depends on them) - final String ref = getCachedString(locParts[3].toUpperCase()); - final String alts = getCachedString(locParts[4].toUpperCase()); - final List alleles = parseAlleles(ref, alts, lineNo); - - // find out our location - int start = 0; - try { - start = Integer.valueOf(locParts[1]); - } catch (Exception e) { - generateException("the value in the POS field must be an integer but it was " + locParts[1], lineNo); - } - int stop = start; - - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = start + alleles.get(0).length() - 1; - } - // we need to parse the INFO field to check for an END tag if it's a symbolic allele - else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) { - final String[] extraParts = new String[4]; - final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - if ( nExtraParts < 3 ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - final Map attrs = parseInfo(extraParts[2]); - try { - stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start; - } catch (Exception e) { - throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo); - } - } else { - stop = VCFAlleleClipper.clipAlleles(start, ref, alleles).stop; - } - - return new VCFLocFeature(locParts[0], start, stop); + return decodeLine(line, false); } - private final static class VCFLocFeature implements Feature { - - final String chr; - final int start, stop; - - private VCFLocFeature(String chr, int start, int stop) { - this.chr = chr; - this.start = start; - this.stop = stop; - } - - public String getChr() { return chr; } - public int getStart() { return start; } - public int getEnd() { return stop; } - } - - /** * decode the line into a feature (VariantContext) * @param line the line * @return a VariantContext */ public VariantContext decode(String line) { + return decodeLine(line, true); + } + + private final VariantContext decodeLine(final String line, final boolean includeGenotypes) { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -276,15 +216,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + " tokens, and saw " + nParts + " )", lineNo); - return parseVCFLine(parts); - } - - protected void generateException(String message) { - throw new UserException.MalformedVCF(message, lineNo); - } - - protected static void generateException(String message, int lineNo) { - throw new UserException.MalformedVCF(message, lineNo); + return parseVCFLine(parts, includeGenotypes); } /** @@ -293,7 +225,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * @param parts the parts split up * @return a variant context object */ - private VariantContext parseVCFLine(String[] parts) { + private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { VariantContextBuilder builder = new VariantContextBuilder(); builder.source(getName()); @@ -315,8 +247,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec else builder.id(parts[2]); - String ref = getCachedString(parts[3].toUpperCase()); - String alts = getCachedString(parts[4].toUpperCase()); + final String ref = getCachedString(parts[3].toUpperCase()); + final String alts = getCachedString(parts[4].toUpperCase()); builder.log10PError(parseQual(parts[5])); final List filters = parseFilters(getCachedString(parts[6])); @@ -325,33 +257,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec builder.attributes(attrs); // get our alleles, filters, and setup an attribute map - List alleles = parseAlleles(ref, alts, lineNo); - - // find out our current location, and clip the alleles down to their minimum length - int stop = pos; - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = pos + alleles.get(0).length() - 1; - } - // we need to parse the INFO field to check for an END tag if it's a symbolic allele - else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) { - try { - stop = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); - } catch (Exception e) { - generateException("the END value in the INFO field is not valid"); - } - } else { - final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, alleles); - if ( clipped.getError() != null ) - generateException(clipped.getError(), lineNo); - stop = clipped.getStop(); - alleles = clipped.clippedAlleles; - } - builder.stop(stop); - builder.alleles(alleles); + final List rawAlleles = parseAlleles(ref, alts, lineNo); + final List alleles = updateBuilderAllelesAndStop(builder, ref, pos, rawAlleles, attrs); // do we have genotyping data - if (parts.length > NUM_STANDARD_FIELDS) { + if (parts.length > NUM_STANDARD_FIELDS && includeGenotypes) { final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); final int nGenotypes = header.getNGenotypeSamples(); LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); @@ -374,6 +284,31 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec return vc; } + private final List updateBuilderAllelesAndStop(final VariantContextBuilder builder, + final String ref, + final int pos, + final List rawAlleles, + final Map attrs) { + int endForSymbolicAlleles = pos; // by default we use the pos + if ( attrs.containsKey(VCFConstants.END_KEY) ) { + // update stop with the end key if provided + try { + endForSymbolicAlleles = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); + } catch (Exception e) { + generateException("the END value in the INFO field is not valid"); + } + } + + // find out our current location, and clip the alleles down to their minimum length + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, rawAlleles, endForSymbolicAlleles); + if ( clipped.getError() != null ) + generateException(clipped.getError(), lineNo); + + builder.stop(clipped.getStop()); + builder.alleles(clipped.getClippedAlleles()); + return clipped.getClippedAlleles(); + } + /** * get the name of this codec * @return our set name @@ -761,4 +696,13 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec public final void disableOnTheFlyModifications() { doOnTheFlyModifications = false; } + + + protected void generateException(String message) { + throw new UserException.MalformedVCF(message, lineNo); + } + + protected static void generateException(String message, int lineNo) { + throw new UserException.MalformedVCF(message, lineNo); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java index 7f11f8784..40ba23d9d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -46,47 +47,52 @@ import java.util.*; * VCF files. * * TODO rethink this class, make it clean, and make it easy to create, mix, and write out alleles + * TODO this code doesn't work with reverse clipped alleles (ATA / GTTA -> AT / GT) * * @author Mark DePristo * @since 6/12 */ public final class VCFAlleleClipper { - // TODO - // TODO - // TODO - // TODO I have honestly no idea what this code is really supposed to do - // TODO computeForwardClipping is called in two places: - // TODO - // TODO clipAlleles() where all alleles, including ref, are passed in - // TODO createVariantContextWithTrimmedAlleles() where only the alt alleles are passed in - // TODO - // TODO The problem is that the code allows us to clip - // TODO - // TODO - // TODO - // TODO - // TODO - // TODO - @Ensures("result == 0 || result == 1") - public static int computeForwardClipping(final List unclippedAlleles, - final byte ref0) { - boolean clipping = true; - int symbolicAlleleCount = 0; + private VCFAlleleClipper() { } + + /** + * Determine whether we should clip off the first base of all unclippped alleles or not + * + * Returns true if all of the alleles in unclippedAlleles share a common first base with + * ref0. Ref0 should be the first base of the reference allele UnclippedAlleles may + * contain the reference allele itself, or just the alternate alleles, it doesn't matter. + * + * The algorithm returns true if the first base should be clipped off, or false otherwise + * + * This algorithm works even in the presence of symbolic alleles, logically ignoring these + * values. It + * + * @param unclippedAlleles list of unclipped alleles to assay + * @param ref0 the first base of the reference allele + * @return true if we should clip the first base of unclippedAlleles + */ + @Requires("unclippedAlleles != null") + public static boolean shouldClipFirstBaseP(final List unclippedAlleles, + final byte ref0) { + boolean allSymbolicAlt = true; for ( final Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) { - symbolicAlleleCount++; continue; } + // already know we aren't symbolic, so we only need to decide if we have only seen a ref + if ( ! a.isReference() ) + allSymbolicAlt = false; + if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { - clipping = false; - break; + return false; } } - // don't clip if all alleles are symbolic - return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; + // to reach here all alleles are consistent with clipping the first base matching ref0 + // but we don't clip if all ALT alleles are symbolic + return ! allSymbolicAlt; } public static int computeReverseClipping(final List unclippedAlleles, @@ -174,48 +180,64 @@ public final class VCFAlleleClipper { * @param unclippedAlleles the list of unclipped alleles, including the reference allele * @return the new reference end position of this event */ - @Requires({"position > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"}) + @Requires({"start > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"}) @Ensures("result != null") public static ClippedAlleles clipAlleles(final int start, final String ref, - final List unclippedAlleles) { + final List unclippedAlleles, + final int endForSymbolicAllele ) { // no variation or single nucleotide events are by definition fully clipped if ( unclippedAlleles.size() == 1 || isSingleNucleotideEvent(unclippedAlleles) ) - return new ClippedAlleles(start, unclippedAlleles); - - // symbolic alleles aren't unclipped by default, and there can be only two (better than highlander style) - if ( unclippedAlleles.size() > 1 ) { - if ( unclippedAlleles.get(1).isSymbolic() ) { - if ( unclippedAlleles.size() > 2 ) - return new ClippedAlleles("Detected multiple alleles at a site, including a symbolic one, which is currently unsupported"); - // we do not unclip symbolic alleles - return new ClippedAlleles(start, unclippedAlleles); - } - } + return new ClippedAlleles(start, unclippedAlleles, null); // we've got to sort out the clipping by looking at the alleles themselves - final int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + final byte firstRefBase = (byte) ref.charAt(0); + final boolean firstBaseIsClipped = shouldClipFirstBaseP(unclippedAlleles, firstRefBase); + final int forwardClipping = firstBaseIsClipped ? 1 : 0; final int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false); + final boolean needsClipping = forwardClipping > 0 || reverseClipping > 0; if ( reverseClipping == -1 ) return new ClippedAlleles("computeReverseClipping failed due to bad alleles"); - final List clippedAlleles = new ArrayList(unclippedAlleles.size()); - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - clippedAlleles.add(a); - } else { - final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping); - if ( !Allele.acceptableAlleleBases(allele) ) - return new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]"); - clippedAlleles.add(Allele.create(allele, a.isReference())); + boolean sawSymbolic = false; + List clippedAlleles; + if ( ! needsClipping ) { + // there's nothing to clip, so clippedAlleles are the original alleles + clippedAlleles = unclippedAlleles; + } else { + clippedAlleles = new ArrayList(unclippedAlleles.size()); + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + sawSymbolic = true; + clippedAlleles.add(a); + } else { + final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping); + if ( !Allele.acceptableAlleleBases(allele) ) + return new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]"); + clippedAlleles.add(Allele.create(allele, a.isReference())); + } } } - // the new reference length - final int refLength = ref.length() - reverseClipping; - final int stop = start + Math.max(refLength - 1, 0); - return new ClippedAlleles(stop, clippedAlleles); + int stop = VariantContextUtils.computeEndFromAlleles(clippedAlleles, start, endForSymbolicAllele); + + // TODO + // TODO + // TODO COMPLETELY BROKEN CODE -- THE GATK CURRENTLY ENCODES THE STOP POSITION FOR CLIPPED ALLELES AS + 1 + // TODO ITS TRUE SIZE TO DIFFERENTIATE CLIPPED VS. UNCLIPPED ALLELES. NEEDS TO BE FIXED + // TODO + // TODO + if ( needsClipping && ! sawSymbolic && ! clippedAlleles.get(0).isNull() ) stop++; + // TODO + // TODO + // TODO COMPLETELY BROKEN CODE -- THE GATK CURRENTLY ENCODES THE STOP POSITION FOR CLIPPED ALLELES AS + 1 + // TODO ITS TRUE SIZE TO DIFFERENTIATE CLIPPED VS. UNCLIPPED ALLELES. NEEDS TO BE FIXED + // TODO + // TODO + + final Byte refBaseForIndel = firstBaseIsClipped ? firstRefBase : null; + return new ClippedAlleles(stop, clippedAlleles, refBaseForIndel); } /** @@ -243,8 +265,9 @@ public final class VCFAlleleClipper { else if ( !inputVC.hasSymbolicAlleles() ) throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); - else - return false; + else if ( inputVC.isMixed() && inputVC.hasSymbolicAlleles() ) + throw new IllegalArgumentException("GATK infrastructure limitation prevents needsPadding from working properly with VariantContexts containing a mixture of symbolic and concrete alleles at " + inputVC); + return false; } public static Allele padAllele(final VariantContext vc, final Allele allele) { @@ -271,26 +294,41 @@ public final class VCFAlleleClipper { throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); final ArrayList alleles = new ArrayList(inputVC.getNAlleles()); - final Map unpaddedToPadded = new HashMap(inputVC.getNAlleles()); + final Map unpaddedToPadded = inputVC.hasGenotypes() ? new HashMap(inputVC.getNAlleles()) : null; + boolean paddedAtLeastOne = false; for (final Allele a : inputVC.getAlleles()) { final Allele padded = padAllele(inputVC, a); + paddedAtLeastOne = paddedAtLeastOne || padded != a; alleles.add(padded); - unpaddedToPadded.put(a, padded); + if ( unpaddedToPadded != null ) unpaddedToPadded.put(a, padded); // conditional to avoid making unnecessary make } - // now we can recreate new genotypes with trimmed alleles - GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); - for (final Genotype g : inputVC.getGenotypes() ) { - final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); - for (final Allele a : g.getAlleles()) { - newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL); + if ( ! paddedAtLeastOne ) + throw new ReviewedStingException("VC was supposed to need padding but no allele was actually changed at location " + inputVC.getChr() + ":" + inputVC.getStart() + " with allele " + inputVC.getAlleles()); + + final VariantContextBuilder vcb = new VariantContextBuilder(inputVC); + vcb.alleles(alleles); + + // the position of the inputVC is one further, if it doesn't contain symbolic alleles + vcb.computeEndFromAlleles(alleles, inputVC.getStart(), inputVC.getEnd()); + + if ( inputVC.hasGenotypes() ) { + assert unpaddedToPadded != null; + + // now we can recreate new genotypes with trimmed alleles + final GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); + for (final Genotype g : inputVC.getGenotypes() ) { + final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); + for (final Allele a : g.getAlleles()) { + newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); } - genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); - + vcb.genotypes(genotypes); } - return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); + return vcb.make(); } else return inputVC; @@ -337,33 +375,60 @@ public final class VCFAlleleClipper { return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? -1 : 0)).alleles(alleles).genotypes(genotypes).make(); } + @Invariant("stop != -1 || error != null") // we're either an error or a meaningful result but not both public static class ClippedAlleles { - final int stop; - final List clippedAlleles; - final String error; + private final int stop; + private final List clippedAlleles; + private final Byte refBaseForIndel; + private final String error; - public ClippedAlleles(final int stop, final List clippedAlleles) { + @Requires({"stop > 0", "clippedAlleles != null"}) + private ClippedAlleles(final int stop, final List clippedAlleles, final Byte refBaseForIndel) { this.stop = stop; this.clippedAlleles = clippedAlleles; this.error = null; + this.refBaseForIndel = refBaseForIndel; } - public ClippedAlleles(final String error) { + @Requires("error != null") + private ClippedAlleles(final String error) { this.stop = -1; this.clippedAlleles = null; + this.refBaseForIndel = null; this.error = error; } + /** + * Get an error if it occurred + * @return the error message, or null if no error occurred + */ public String getError() { return error; } + /** + * Get the stop position to use after the clipping as been applied, given the + * provided position to clipAlleles + * @return + */ public int getStop() { return stop; } + /** + * Get the clipped alleles themselves + * @return the clipped alleles in the order of the input unclipped alleles + */ public List getClippedAlleles() { return clippedAlleles; } + + /** + * Returns the reference base we should use for indels, or null if none is appropriate + * @return + */ + public Byte getRefBaseForIndel() { + return refBaseForIndel; + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 5cd4766c6..c62afd284 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1176,7 +1176,7 @@ public class VariantContext implements Feature { // to enable tribble integratio // if ( getType() == Type.INDEL ) { // if ( getReference().length() != (getLocation().size()-1) ) { long length = (stop - start) + 1; - if ( ! isSymbolic() + if ( ! hasSymbolicAlleles() && ((getReference().isNull() && length != 1 ) || (getReference().isNonNull() && (length - getReference().length() > 1)))) { throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); @@ -1477,7 +1477,11 @@ public class VariantContext implements Feature { // to enable tribble integratio } public boolean hasSymbolicAlleles() { - for (final Allele a: getAlleles()) { + return hasSymbolicAlleles(getAlleles()); + } + + public static boolean hasSymbolicAlleles( final List alleles ) { + for ( final Allele a: alleles ) { if (a.isSymbolic()) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 01d3ab456..0590329c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -454,6 +454,32 @@ public class VariantContextBuilder { return this; } + /** + * @see #computeEndFromAlleles(java.util.List, int, int) with endForSymbolicAlleles == -1 + */ + public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start) { + return computeEndFromAlleles(alleles, start, -1); + } + + /** + * Compute the end position for this VariantContext from the alleles themselves + * + * @see VariantContextUtils.computeEndFromAlleles() + * + * assigns this builder the stop position computed. + * + * @param alleles the list of alleles to consider. The reference allele must be the first one + * @param start the known start position of this event + * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 + * if no is expected but will throw an error if one is found + * @return this builder + */ + @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) + public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { + stop(VariantContextUtils.computeEndFromAlleles(alleles, start, endForSymbolicAlleles)); + return this; + } + /** * @return true if this builder contains fully decoded data * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 33f5a3759..f1bebaa30 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -705,7 +705,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (VCFAlleleClipper.computeForwardClipping(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)) > 0); + trimVC = VCFAlleleClipper.shouldClipFirstBaseP(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)); } // nothing to do if we don't need to trim bases @@ -1370,4 +1370,33 @@ public class VariantContextUtils { return true; // we passed all tests, we matched } + + /** + * Compute the end position for this VariantContext from the alleles themselves + * + * In the trivial case this is a single BP event and end = start (open intervals) + * In general the end is start + ref length - 1, handling the case where ref length == 0 + * However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases + * + * @param alleles the list of alleles to consider. The reference allele must be the first one + * @param start the known start position of this event + * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 + * if no is expected but will throw an error if one is found + * @return this builder + */ + @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) + public static int computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { + final Allele ref = alleles.get(0); + + if ( ref.isNonReference() ) + throw new ReviewedStingException("computeEndFromAlleles requires first allele to be reference"); + + if ( VariantContext.hasSymbolicAlleles(alleles) ) { + if ( endForSymbolicAlleles == -1 ) + throw new ReviewedStingException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided"); + return endForSymbolicAlleles; + } else { + return start + Math.max(ref.length() - 1, 0); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 108bc42f7..e281475f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -266,6 +266,8 @@ class BCF2Writer extends IndexingVariantContextWriter { if ( needsPadding ) allele = VCFAlleleClipper.padAllele(vc, allele); final byte[] s = allele.getDisplayBases(); + if ( s == null ) + throw new ReviewedStingException("BUG: BCF2Writer encountered null padded allele" + allele); encoder.encodeTypedString(s); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index c5ca22242..6d37ae2de 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -44,11 +44,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "a10a0a20c6402207a2d968113595fde8"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "c1b89186cdfa48b26865bf9737433e1c"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "359a7da40e4803fc5e43e0e5211ef013"); + DTTest("testMultiSample ", "-I " + multiSample, "644e586e53c6f9bfd8e51e24f78da7b3"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index d16d981ac..eaed969a4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -65,6 +65,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibration-"+params.inVCF, spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java similarity index 66% rename from public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java index 55139e9e5..8cd051e01 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java @@ -27,12 +27,13 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.variantcontext.*; import org.testng.Assert; +import org.testng.SkipException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; -public class VCFUtilsUnitTest extends BaseTest { +public class VCFAlleleClipperUnitTest extends BaseTest { // -------------------------------------------------------------------------------- // // Test allele clipping @@ -67,6 +68,15 @@ public class VCFUtilsUnitTest extends BaseTest { } } + public boolean isClipped() { + for ( int i = 0; i < inputs.size(); i++ ) { + if ( inputs.get(i).length() != expected.get(i).length() ) + return true; + } + + return false; + } + public String toString() { return String.format("ClipAllelesTest input=%s expected=%s", inputs, expected); } @@ -102,12 +112,30 @@ public class VCFUtilsUnitTest extends BaseTest { new ClipAllelesTest(10, 11, "AC", "CT", "GG", "AC", "CT", "GG"); // symbolic - new ClipAllelesTest(10, 10, "A", "", "A", ""); - new ClipAllelesTest(50, 50, "G", "G]22:60]", "A", "G]22:60]"); - new ClipAllelesTest(51, 51, "T", "]22:55]T", "A", "]22:55]T"); - new ClipAllelesTest(52, 52, "C", "C[22:51[", "A", "C[22:51["); + new ClipAllelesTest(10, 100, "A", "", "A", ""); + new ClipAllelesTest(50, 50, "G", "G]22:60]", "G", "G]22:60]"); + new ClipAllelesTest(51, 51, "T", "]22:55]T", "T", "]22:55]T"); + new ClipAllelesTest(52, 52, "C", "C[22:51[", "C", "C[22:51["); new ClipAllelesTest(60, 60, "A", "A]22:50]", "A", "A]22:50]"); + // symbolic with alleles that should be clipped + new ClipAllelesTest(10, 100, "A", "", "AA", "-", "", "A"); + new ClipAllelesTest(10, 100, "AA", "", "A", "A", "", "-"); + new ClipAllelesTest(10, 100, "AA", "", "A", "AAA", "A", "", "-", "AA"); + new ClipAllelesTest(10, 100, "AG", "", "A", "AGA", "G", "", "-", "GA"); + new ClipAllelesTest(10, 100, "G", "", "A", "G", "", "A"); + + // clipping from both ends + // + // TODO -- THIS CODE IS BROKEN BECAUSE CLIPPING DOES WORK WITH ALLELES CLIPPED FROM THE END + // +// new ClipAllelesTest(10, 10, "ATA", "ATTA", "-", "T"); +// new ClipAllelesTest(10, 10, "ATAA", "ATTAA", "-", "T"); +// new ClipAllelesTest(10, 10, "ATAAG", "ATTAAG", "-", "T"); +// new ClipAllelesTest(10, 11, "GTA", "ATTA", "G", "AT"); +// new ClipAllelesTest(10, 11, "GTAA", "ATTAA", "G", "AT"); +// new ClipAllelesTest(10, 11, "GTAAG", "ATTAAG", "G", "AT"); + // complex substitutions new ClipAllelesTest(10, 10, "A", "GA", "A", "GA"); @@ -116,12 +144,37 @@ public class VCFUtilsUnitTest extends BaseTest { @Test(dataProvider = "ClipAllelesTest") public void testClipAllelesTest(ClipAllelesTest cfg) { - final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs); + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs, cfg.stop); Assert.assertNull(clipped.getError(), "Unexpected error occurred"); Assert.assertEquals(clipped.getStop(), cfg.stop, "Clipped alleles stop"); Assert.assertEquals(clipped.getClippedAlleles(), cfg.expected, "Clipped alleles"); } + @Test(dataProvider = "ClipAllelesTest", dependsOnMethods = "testClipAllelesTest") + public void testPaddingAllelesInVC(final ClipAllelesTest cfg) { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs, cfg.stop); + final VariantContext vc = new VariantContextBuilder("x", "1", cfg.position, cfg.stop, clipped.getClippedAlleles()) + .referenceBaseForIndel(clipped.getRefBaseForIndel()).make(); + + if ( vc.isMixed() && vc.hasSymbolicAlleles() ) + throw new SkipException("GATK cannot handle mixed variant contexts with symbolic and concrete alleles. Remove this check when allele clipping and padding is generalized"); + + Assert.assertEquals(VCFAlleleClipper.needsPadding(vc), cfg.isClipped(), "needPadding method"); + + if ( cfg.isClipped() ) { + // TODO + // TODO note that the GATK currently uses a broken approach to the clipped alleles, so the expected stop is + // TODO actually the original stop, as the original stop is +1 its true size. + // TODO + final int expectedStop = vc.getEnd(); // + (vc.hasSymbolicAlleles() ? 0 : 1); + + final VariantContext padded = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); + Assert.assertEquals(padded.getStart(), vc.getStart(), "padded VC start"); + Assert.assertEquals(padded.getAlleles(), cfg.inputs, "padded VC alleles == original unclipped alleles"); + Assert.assertEquals(padded.getEnd(), expectedStop, "padded VC end should be clipped VC + 1 (added a base to ref allele)"); + Assert.assertFalse(VCFAlleleClipper.needsPadding(padded), "padded VC shouldn't need padding again"); + } + } // -------------------------------------------------------------------------------- //