From 5a9a37ba01f8aa47b8922d41cc718443eccb65f1 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 29 Jun 2012 11:06:12 -0400 Subject: [PATCH 01/19] Pool caller improvements: a) Log ref sample depth at every called site (will add more ref-related annotations later), b) Make -glm POOLBOTH work in case we want to genotype snp's and indels together, c) indel bug fix (pool and non-pool): prevent a bad GenomeLoc to be formed if we're running GGA and incoming alleles are larger than ref window size (typically 400 bp) --- ...elGenotypeLikelihoodsCalculationModel.java | 5 ++++- .../genotyper/UnifiedGenotyperEngine.java | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 96cf5e8c5..230d6c324 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -186,7 +186,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), + if (hsize <= 0) // protect against event lengths larger than ref window sizes + haplotypeMap.clear(); + else + haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 
60fa75f41..5cc74e729 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -367,6 +367,8 @@ public class UnifiedGenotyperEngine { // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); + // inherit attributed from input vc + attributes.putAll(vc.getAttributes()); // if the site was downsampled, record that fact if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); @@ -581,6 +583,9 @@ public class UnifiedGenotyperEngine { final AlignmentContext rawContext) { final List models = new ArrayList(2); + String modelPrefix = ""; + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); // if we're genotyping given alleles and we have a requested SNP at this position, do SNP if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { @@ -590,24 +595,24 @@ public class UnifiedGenotyperEngine { if ( vcInput.isSNP() ) { // ignore SNPs if the user chose INDEL mode only - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) - models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") ) models.add(UAC.GLmodel); } else if ( vcInput.isIndel() || vcInput.isMixed() ) { // ignore INDELs if the user chose SNP mode only - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) - models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + 
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); else if (UAC.GLmodel.name().toUpperCase().contains("INDEL")) models.add(UAC.GLmodel); } // No support for other types yet } else { - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) { - models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); - models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); } else { models.add(UAC.GLmodel); From f631be8d80c2a8be04621dfafa9a9ddcde7f2821 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 29 Jun 2012 23:51:52 -0400 Subject: [PATCH 06/19] UnifiedGenotyperEngine.calculateGenotypes() is not only used in UG but in other walkers - vc attributes shouldn't be inherited by default or it may cause undefined behaviour in those walkers, so only inherit attributes from input vc in case of UG calling this function --- .../genotyper/UnifiedGenotyperEngine.java | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index ecaf9df6a..64f04c0f3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -188,7 +188,7 @@ public class UnifiedGenotyperEngine { else { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model)); + 
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true)); } } } @@ -309,6 +309,22 @@ public class UnifiedGenotyperEngine { } public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false); + } + + /** + * Main entry function to calculate genotypes of a given VC with corresponding GL's + * @param tracker Tracker + * @param refContext Reference context + * @param rawContext Raw context + * @param stratifiedContexts Stratified alignment contexts + * @param vc Input VC + * @param model GL calculation model + * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc + * @return VC with assigned genotypes + */ + public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; @@ -408,8 +424,9 @@ public class UnifiedGenotyperEngine { // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); - // inherit attributed from input vc - attributes.putAll(vc.getAttributes()); + // inherit attributed from input vc if requested + if (inheritAttributesFromInputVC) + attributes.putAll(vc.getAttributes()); // if the site was downsampled, record that fact if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); From 6bea28ae6f0ebb54f49f10622a8c6f0c2e682053 Mon Sep 17 00:00:00 2001 
From: Mark DePristo Date: Thu, 28 Jun 2012 14:27:09 -0400 Subject: [PATCH 07/19] Genotype filters are now just Strings, not Set --- .../filters/VariantFiltrationWalker.java | 3 +- .../bcf2/BCF2GenotypeFieldDecoders.java | 5 +- .../codecs/vcf/VCFStandardHeaderLines.java | 2 +- .../utils/variantcontext/FastGenotype.java | 7 +- .../sting/utils/variantcontext/Genotype.java | 45 +++++++--- .../utils/variantcontext/GenotypeBuilder.java | 40 +++++---- .../utils/variantcontext/SlowGenotype.java | 13 ++- .../writer/BCF2FieldWriter.java | 28 +++++++ .../writer/BCF2FieldWriterManager.java | 2 + .../variantcontext/writer/VCFWriter.java | 84 +++++++++---------- .../variantcontext/GenotypeUnitTest.java | 20 +++++ .../VariantContextTestProvider.java | 2 +- 12 files changed, 163 insertions(+), 88 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 71352bddd..1d85abbcd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -297,7 +297,8 @@ public class VariantFiltrationWalker extends RodWalker { // for each genotype, check filters then create a new object for ( final Genotype g : vc.getGenotypes() ) { if ( g.isCalled() ) { - List filters = new ArrayList(g.getFilters()); + final List filters = new ArrayList(); + if ( g.isFiltered() ) filters.add(g.getFilters()); for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { if ( VariantContextUtils.match(vc, g, exp) ) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index 59537a329..0dadc49f9 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -276,9 +276,8 @@ public class BCF2GenotypeFieldDecoders { public void decode(final List siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - if ( value != null ) { // don't add missing values - gb.filters(value instanceof String ? Collections.singletonList((String)value) : (List)value); - } + assert value == null || value instanceof String; + gb.filter((String)value); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java index dcc141b00..40d22f46f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java @@ -183,7 +183,7 @@ public class VCFStandardHeaderLines { registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); + 
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); // INFO lines registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java index e1185ba70..d528bf0e4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java @@ -111,8 +111,9 @@ public final class FastGenotype extends Genotype { final int DP, final int[] AD, final int[] PL, + final String filters, final Map extendedAttributes) { - super(sampleName); + super(sampleName, filters); this.alleles = alleles; this.isPhased = isPhased; this.GQ = GQ; @@ -152,10 +153,6 @@ public final class FastGenotype extends Genotype { return GQ; } - @Override public List getFilters() { - return (List) getExtendedAttribute(VCFConstants.GENOTYPE_FILTER_KEY, Collections.emptyList()); - } - @Override public int[] getPL() { return PL; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index d268aabc6..fae0a7c4c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -27,6 +27,7 @@ public abstract class Genotype implements Comparable { * extended attributes map */ public final static Collection PRIMARY_KEYS = Arrays.asList( + VCFConstants.GENOTYPE_FILTER_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, @@ -38,14 +39,11 @@ public abstract class Genotype implements Comparable { private final String sampleName; private GenotypeType type = 
null; + private final String filters; - protected Genotype(final String sampleName) { + protected Genotype(final String sampleName, final String filters) { this.sampleName = sampleName; - } - - protected Genotype(final String sampleName, final GenotypeType type) { - this.sampleName = sampleName; - this.type = type; + this.filters = filters; } /** @@ -348,13 +346,14 @@ public abstract class Genotype implements Comparable { } public String toString() { - return String.format("[%s %s%s%s%s%s%s]", + return String.format("[%s %s%s%s%s%s%s%s]", getSampleName(), getGenotypeString(false), toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, getGQ()), toStringIfExists(VCFConstants.DEPTH_KEY, getDP()), toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, getAD()), toStringIfExists(VCFConstants.GENOTYPE_PL_KEY, getPL()), + toStringIfExists(VCFConstants.GENOTYPE_FILTER_KEY, getFilters()), sortedString(getExtendedAttributes())); } @@ -448,15 +447,25 @@ public abstract class Genotype implements Comparable { } /** + * Returns the filter string associated with this Genotype. * - * @return + * @return If this result == null, then the genotype is considered PASSing filters + * If the result != null, then the genotype has failed filtering for the reason(s) + * specified in result. To be reference compliant multiple filter field + * string values can be encoded with a ; separator. */ - @Ensures({"result != null"}) - public abstract List getFilters(); + public final String getFilters() { + return filters; + } - @Ensures({"result != getFilters().isEmpty()"}) - public boolean isFiltered() { - return ! getFilters().isEmpty(); + /** + * Is this genotype filtered or not? 
+ * + * @return returns false if getFilters() == null + */ + @Ensures({"result != (getFilters() == null)"}) + public final boolean isFiltered() { + return getFilters() != null; } @Deprecated public boolean hasLog10PError() { return hasGQ(); } @@ -569,6 +578,16 @@ public abstract class Genotype implements Comparable { return v == -1 ? "" : " " + name + " " + v; } + /** + * Returns a display name for field name with String value v if this isn't null. Otherwise returns "" + * @param name of the field ("FT") + * @param v the value of the field, or null if missing + * @return a non-null string for display if the field is not missing + */ + protected final static String toStringIfExists(final String name, final String v) { + return v == null ? "" : " " + name + " " + v; + } + /** * Returns a display name for field name with values vs if this isn't null. Otherwise returns "" * @param name of the field ("AD") diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index ac0c503f7..81d714294 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; +import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import java.util.*; @@ -63,6 +64,7 @@ public final class GenotypeBuilder { private int[] AD = null; private int[] PL = null; private Map extendedAttributes = null; + private String filters = null; private int initialAttributeMapSize = 5; private boolean useFast = MAKE_FAST_BY_DEFAULT; @@ -147,6 +149,7 @@ public final class GenotypeBuilder { DP(g.getDP()); AD(g.getAD()); 
PL(g.getPL()); + filter(g.getFilters()); attributes(g.getExtendedAttributes()); return this; } @@ -164,6 +167,7 @@ public final class GenotypeBuilder { DP = -1; AD = null; PL = null; + filters = null; extendedAttributes = null; } @@ -180,21 +184,11 @@ public final class GenotypeBuilder { public Genotype make() { if ( useFast ) { final Map ea = extendedAttributes == null ? NO_ATTRIBUTES : extendedAttributes; - return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, ea); + return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea); } else { final Map attributes = new LinkedHashMap(); if ( extendedAttributes != null ) attributes.putAll(extendedAttributes); final double log10PError = GQ == -1 ? SlowGenotype.NO_LOG10_PERROR : (GQ == 0 ? 0 : GQ / -10.0); - - Set filters = null; - if ( extendedAttributes != null && extendedAttributes.containsKey(VCFConstants.GENOTYPE_FILTER_KEY) ) - { - final Object f = extendedAttributes.get(VCFConstants.GENOTYPE_FILTER_KEY); - if ( f != null ) - filters = new LinkedHashSet((List)f); - attributes.remove(VCFConstants.GENOTYPE_FILTER_KEY); - } - if ( DP != -1 ) attributes.put(VCFConstants.DEPTH_KEY, DP); if ( AD != null ) attributes.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, AD); final double[] log10likelihoods = PL != null ? GenotypeLikelihoods.fromPLs(PL).getAsVector() : null; @@ -383,9 +377,12 @@ public final class GenotypeBuilder { */ @Requires("filters != null") public GenotypeBuilder filters(final List filters) { - if ( ! 
filters.isEmpty() ) - attribute(VCFConstants.GENOTYPE_FILTER_KEY, filters); - return this; + if ( filters.isEmpty() ) + return filter(null); + else if ( filters.size() == 1 ) + return filter(filters.get(0)); + else + return filter(ParsingUtils.join(";", filters)); } /** @@ -398,15 +395,24 @@ public final class GenotypeBuilder { return filters(Arrays.asList(filters)); } + /** + * Most efficient version of setting filters -- just set the filters string to filters + * + * @param filter if filters == null or filters.equals("PASS") => genotype is PASS + * @return + */ + public GenotypeBuilder filter(final String filter) { + this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? null : filter; + return this; + } + /** * This genotype is unfiltered * * @return */ public GenotypeBuilder unfiltered() { - if ( extendedAttributes != null ) - extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY); - return this; + return filter(null); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java index 0dda82050..c3f027484 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java @@ -42,14 +42,20 @@ public class SlowGenotype extends Genotype { protected List alleles = null; protected boolean isPhased = false; - protected SlowGenotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { - super(sampleName); + protected SlowGenotype(final String sampleName, + final List alleles, + final double log10PError, + final String filters, + final Map attributes, + final boolean isPhased, + final double[] log10Likelihoods) { + super(sampleName, filters); if ( alleles == null || alleles.isEmpty() ) this.alleles = Collections.emptyList(); else this.alleles = 
Collections.unmodifiableList(alleles); - commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes); + commonInfo = new CommonInfo(sampleName, log10PError, Collections.emptySet(), attributes); if ( log10Likelihoods != null ) commonInfo.putAttribute(VCFConstants.GENOTYPE_PL_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); this.isPhased = isPhased; @@ -112,7 +118,6 @@ public class SlowGenotype extends Genotype { // get routines to access context info fields // // --------------------------------------------------------------------------------------------------------- - @Override public List getFilters() { return new ArrayList(commonInfo.getFilters()); } @Override public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } @Override public double getLog10PError() { return commonInfo.getLog10PError(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 42567adb8..5b81e7117 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; @@ -49,17 +50,22 @@ public abstract class BCF2FieldWriter { private final VCFHeader header; private final BCF2FieldEncoder fieldEncoder; + @Requires({"header != null", "fieldEncoder != null"}) protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { this.header = header; this.fieldEncoder = fieldEncoder; } + @Ensures("result != null") protected VCFHeader getHeader() { return header; } + @Ensures("result 
!= null") protected BCF2FieldEncoder getFieldEncoder() { return fieldEncoder; } + @Ensures("result != null") protected String getField() { return getFieldEncoder().getField(); } + @Requires("vc != null") public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { fieldEncoder.writeFieldKey(encoder); } @@ -124,6 +130,9 @@ public abstract class BCF2FieldWriter { } @Override + @Requires({"encodingType != null", + "nValuesPerGenotype >= 0 || ! getFieldEncoder().hasConstantNumElements()"}) + @Ensures("nValuesPerGenotype >= 0") public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { // writes the key information super.start(encoder, vc); @@ -141,15 +150,18 @@ public abstract class BCF2FieldWriter { encoder.encodeType(nValuesPerGenotype, encodingType); } + @Requires({"encodingType != null", "nValuesPerGenotype >= 0"}) public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { final Object fieldValue = g.getExtendedAttribute(getField(), null); getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); } + @Ensures({"result >= 0"}) protected int numElements(final VariantContext vc, final Genotype g) { return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField())); } + @Ensures({"result >= 0"}) private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) { int size = -1; @@ -227,6 +239,22 @@ public abstract class BCF2FieldWriter { } } + public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter { + public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); + } + + public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { + final String fieldValue = g.getFilters(); + getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); + } + + 
@Override + protected int numElements(final VariantContext vc, final Genotype g) { + return getFieldEncoder().numElements(vc, g.getFilters()); + } + } + public static class GTWriter extends GenotypesWriter { final Map alleleMapForTriPlus = new HashMap(5); Allele ref, alt1; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 9f5bbc487..269750c66 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -140,6 +140,8 @@ public class BCF2FieldWriterManager { if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { return new BCF2FieldWriter.GTWriter(header, fieldEncoder); + } else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { + return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder); } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field)); } else if ( line.getType() == VCFHeaderLineType.Integer ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index ee7b1b9ef..054da9825 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -348,9 +348,8 @@ class VCFWriter extends IndexingVariantContextWriter { missingSampleError(vc, mHeader); } - List attrs = new ArrayList(genotypeFormatKeys.size()); + final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { if 
( !g.isAvailable() ) { throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record"); @@ -363,54 +362,53 @@ class VCFWriter extends IndexingVariantContextWriter { } continue; - } - - String outputValue; - final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field); - if ( accessor != null ) { - final int[] intValues = accessor.getValues(g); - if ( intValues == null ) - outputValue = VCFConstants.MISSING_VALUE_v4; - else if ( intValues.length == 1 ) // fast path - outputValue = Integer.toString(intValues[0]); - else { - StringBuilder sb = new StringBuilder(); - sb.append(intValues[0]); - for ( int i = 1; i < intValues.length; i++) { - sb.append(","); - sb.append(intValues[i]); - } - outputValue = sb.toString(); - } } else { - Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; - - // some exceptions + String outputValue; if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) { - val = g.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4; - } - - VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); - if ( metaData != null ) { - int numInFormatField = metaData.getCount(vc); - if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { - // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. - // For example, if Number=2, the string has to be ".,." - StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); - for ( int i = 1; i < numInFormatField; i++ ) { - sb.append(","); - sb.append(VCFConstants.MISSING_VALUE_v4); + outputValue = g.isFiltered() ? 
g.getFilters() : VCFConstants.PASSES_FILTERS_v4; + } else { + final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field); + if ( accessor != null ) { + final int[] intValues = accessor.getValues(g); + if ( intValues == null ) + outputValue = VCFConstants.MISSING_VALUE_v4; + else if ( intValues.length == 1 ) // fast path + outputValue = Integer.toString(intValues[0]); + else { + StringBuilder sb = new StringBuilder(); + sb.append(intValues[0]); + for ( int i = 1; i < intValues.length; i++) { + sb.append(","); + sb.append(intValues[i]); + } + outputValue = sb.toString(); } - val = sb.toString(); + } else { + Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; + + VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); + if ( metaData != null ) { + int numInFormatField = metaData.getCount(vc); + if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { + // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. + // For example, if Number=2, the string has to be ".,." 
+ StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); + for ( int i = 1; i < numInFormatField; i++ ) { + sb.append(","); + sb.append(VCFConstants.MISSING_VALUE_v4); + } + val = sb.toString(); + } + } + + // assume that if key is absent, then the given string encoding suffices + outputValue = formatVCFField(val); } } - // assume that if key is absent, then the given string encoding suffices - outputValue = formatVCFField(val); + if ( outputValue != null ) + attrs.add(outputValue); } - - if ( outputValue != null ) - attrs.add(outputValue); } // strip off trailing missing values diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java index 36b2ade53..e511a7f30 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java @@ -48,6 +48,26 @@ public class GenotypeUnitTest extends BaseTest { T = Allele.create("T"); } + private static final GenotypeBuilder makeGB() { + return new GenotypeBuilder("misc"); + } + + @Test + public void testFilters() { + Assert.assertFalse(makeGB().make().isFiltered(), "by default Genotypes must be PASS"); + Assert.assertNull(makeGB().make().getFilters(), "by default Genotypes must be PASS => getFilters() == null"); + Assert.assertFalse(makeGB().filter(null).make().isFiltered(), "setting filter == null => Genotypes must be PASS"); + Assert.assertNull(makeGB().filter(null).make().getFilters(), "Genotypes PASS => getFilters == null"); + Assert.assertFalse(makeGB().filter("PASS").make().isFiltered(), "setting filter == PASS => Genotypes must be PASS"); + Assert.assertNull(makeGB().filter("PASS").make().getFilters(), "Genotypes PASS => getFilters == null"); + Assert.assertTrue(makeGB().filter("x").make().isFiltered(), "setting filter != null => Genotypes must be PASS"); + 
Assert.assertEquals(makeGB().filter("x").make().getFilters(), "x", "Should get back the expected filter string"); + Assert.assertEquals(makeGB().filters("x", "y").make().getFilters(), "x;y", "Multiple filter field values should be joined with ;"); + Assert.assertEquals(makeGB().filters("x", "y", "z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;"); + Assert.assertTrue(makeGB().filters("x", "y", "z").make().isFiltered(), "Multiple filter values should be filtered"); + Assert.assertEquals(makeGB().filter("x;y;z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;"); + } + // public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { // public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { // public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 528f3dd29..4167381c1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -191,7 +191,7 @@ public class VariantContextTestProvider { addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); - addHeaderLine(metaData, "FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); + addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); // prep the header metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, 
Collections.singletonMap("ID", "1"), 0)); From 86feea917ea05bad1d853008f70adefc85b5cbf7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 28 Jun 2012 16:40:16 -0400 Subject: [PATCH 08/19] Updating MD5s to reflect new FT fixed count of 1 not UNBOUNDED --- .../diagnostics/targets/DiagnoseTargetsIntegrationTest.java | 6 +++--- .../VariantRecalibrationWalkersIntegrationTest.java | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 63b2d39f1..c5ca22242 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -38,17 +38,17 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { private void DTTest(String testName, String args, String md5) { String base = String.format("-T DiagnoseTargets --no_cmdline_in_header -R %s -L %s", REF, L) + " -o %s "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); - spec.disableShadowBCF(); + //spec.disableShadowBCF(); executeTest(testName, spec); } @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "ef71a569a48697c89e642cdda7bfb766"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "a10a0a20c6402207a2d968113595fde8"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "1e6e15156e01e736274898fdac77d911"); + DTTest("testMultiSample ", "-I " + multiSample, "359a7da40e4803fc5e43e0e5211ef013"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index e0cda07d7..eaed969a4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -50,7 +50,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); } @@ -99,7 +98,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); } @@ -116,7 +114,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } From 16276f81a1920b79846c369c9e61244b84c1918a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 28 Jun 2012 22:36:26 -0400 Subject: [PATCH 09/19] BCF2 with support symbolic alleles -- refactored allele clipping / padding code into VCFAlleleClipping class, and added much needed docs and TODOs for methods dev guys -- Added real unit tests for (some) clipping operations in VCFUtilsUnitTest --- .../genotyper/UnifiedGenotyperEngine.java | 3 +- 
.../variantutils/LiftoverVariants.java | 2 +- .../sting/utils/codecs/bcf2/BCF2Codec.java | 29 +- .../codecs/bcf2/BCF2LazyGenotypesDecoder.java | 4 +- .../utils/codecs/vcf/AbstractVCFCodec.java | 112 +----- .../utils/codecs/vcf/VCFAlleleClipper.java | 369 ++++++++++++++++++ .../sting/utils/codecs/vcf/VCFUtils.java | 3 + .../utils/variantcontext/VariantContext.java | 4 +- .../variantcontext/VariantContextUtils.java | 125 +----- .../variantcontext/writer/BCF2Writer.java | 4 +- .../variantcontext/writer/VCFWriter.java | 4 +- .../org/broadinstitute/sting/BaseTest.java | 11 +- .../CNV/SymbolicAllelesIntegrationTest.java | 9 +- .../VariantFiltrationIntegrationTest.java | 4 +- ...ntRecalibrationWalkersIntegrationTest.java | 1 - .../utils/codecs/vcf/VCFCodecUnitTest.java | 91 ----- .../utils/codecs/vcf/VCFIntegrationTest.java | 2 +- .../utils/codecs/vcf/VCFUtilsUnitTest.java | 173 ++++++++ .../VariantContextTestProvider.java | 61 +-- 19 files changed, 634 insertions(+), 377 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java delete mode 100644 public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 64f04c0f3..7a5a1ba0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; +import 
org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -493,7 +494,7 @@ public class UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed - vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + vcCall = VCFAlleleClipper.reverseTrimAlleles(vcCall); if ( annotationEngine != null && !limitedContext ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index a3e1c0bd7..21965afcd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -129,7 +129,7 @@ public class LiftoverVariants extends RodWalker { .attribute("OriginalStart", fromInterval.getStart()).make(); } - VariantContext newVC = VariantContextUtils.createVariantContextWithPaddedAlleles(vc); + VariantContext newVC = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), 
originalVC.getStart(), newVC.getChr(), newVC.getStart(), diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 94c24b097..06fb07f94 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -51,6 +51,8 @@ import java.util.*; */ public final class BCF2Codec implements FeatureCodec, ReferenceDependentFeatureCodec { final protected static Logger logger = Logger.getLogger(BCF2Codec.class); + private final static boolean FORBID_SYMBOLICS = false; + private VCFHeader header = null; /** @@ -270,7 +272,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD " samples in header but have a record with " + nSamples + " samples"); decodeID(builder); - final ArrayList alleles = decodeAlleles(builder, pos, nAlleles); + final List alleles = decodeAlleles(builder, pos, nAlleles); decodeFilter(builder); decodeInfo(builder, nInfo); @@ -283,9 +285,9 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD protected final static class SitesInfoForDecoding { final int nFormatFields; final int nSamples; - final ArrayList alleles; + final List alleles; - private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList alleles) { + private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final List alleles) { this.nFormatFields = nFormatFields; this.nSamples = nSamples; this.alleles = alleles; @@ -325,13 +327,14 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD * @param unclippedAlleles * @return */ - protected static ArrayList clipAllelesIfNecessary(int position, String ref, ArrayList unclippedAlleles) { - if ( ! 
AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { - final ArrayList clippedAlleles = new ArrayList(unclippedAlleles.size()); - AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1); - return clippedAlleles; - } else - return unclippedAlleles; + @Requires({"position > 0", "ref != null && ref.length() > 0", "! unclippedAlleles.isEmpty()"}) + @Ensures("result.size() == unclippedAlleles.size()") + protected List clipAllelesIfNecessary(final int position, + final String ref, + final List unclippedAlleles) { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles); + if ( clipped.getError() != null ) error(clipped.getError()); + return clipped.getClippedAlleles(); } /** @@ -342,9 +345,9 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD * @return the alleles */ @Requires("nAlleles > 0") - private ArrayList decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { + private List decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes - ArrayList alleles = new ArrayList(nAlleles); + List alleles = new ArrayList(nAlleles); String ref = null; for ( int i = 0; i < nAlleles; i++ ) { @@ -356,7 +359,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD alleles.add(allele); - if ( allele.isSymbolic() ) + if ( FORBID_SYMBOLICS && allele.isSymbolic() ) throw new ReviewedStingException("LIMITATION: GATK BCF2 codec does not yet support symbolic alleles"); } assert ref != null; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index c749325fb..35fb2e97a 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -44,13 +44,13 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec // and its stored here again for code cleanliness private final BCF2Codec codec; - private final ArrayList siteAlleles; + private final List siteAlleles; private final int nSamples; private final int nFields; private final GenotypeBuilder[] builders; @Requires("codec.getHeader().getNGenotypeSamples() == builders.length") - BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, + BCF2LazyGenotypesDecoder(final BCF2Codec codec, final List alleles, final int nSamples, final int nFields, final GenotypeBuilder[] builders) { this.codec = codec; this.siteAlleles = alleles; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index f9f310538..6302d1053 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -229,10 +229,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec } catch (Exception e) { throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo); } - } - // handle multi-positional events - else if ( !isSingleNucleotideEvent(alleles) ) { - stop = clipAlleles(start, ref, alleles, null, lineNo); + } else { + stop = VCFAlleleClipper.clipAlleles(start, ref, alleles).stop; } return new VCFLocFeature(locParts[0], start, stop); @@ -342,10 +340,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec } catch (Exception e) { generateException("the 
END value in the INFO field is not valid"); } - } else if ( !isSingleNucleotideEvent(alleles) ) { - ArrayList newAlleles = new ArrayList(); - stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo); - alleles = newAlleles; + } else { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, alleles); + if ( clipped.getError() != null ) + generateException(clipped.getError(), lineNo); + stop = clipped.getStop(); + alleles = clipped.clippedAlleles; } builder.stop(stop); builder.alleles(alleles); @@ -613,102 +613,6 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec alleles.add(allele); } - public static boolean isSingleNucleotideEvent(List alleles) { - for ( Allele a : alleles ) { - if ( a.length() != 1 ) - return false; - } - return true; - } - - public static int computeForwardClipping(final List unclippedAlleles, final byte ref0) { - boolean clipping = true; - int symbolicAlleleCount = 0; - - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - symbolicAlleleCount++; - continue; - } - - if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { - clipping = false; - break; - } - } - - // don't clip if all alleles are symbolic - return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; - } - - public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong - // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). - if ( a.length() - clipping == 0 ) - return clipping - (allowFullClip ? 
0 : 1); - - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { - stillClipping = false; - } - else if ( ref.length == clipping ) { - if ( allowFullClip ) - stillClipping = false; - else - generateException("bad alleles encountered", lineNo); - } - else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { - stillClipping = false; - } - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - - /** - * clip the alleles, based on the reference - * - * @param position the unadjusted start position (pre-clipping) - * @param ref the reference string - * @param unclippedAlleles the list of unclipped alleles - * @param clippedAlleles output list of clipped alleles - * @param lineNo the current line number in the file - * @return the new reference end position of this event - */ - public static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - - int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo); - - if ( clippedAlleles != null ) { - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - clippedAlleles.add(a); - } else { - final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping); - if ( !Allele.acceptableAlleleBases(allele) ) - generateException("Unparsable vcf record with bad allele [" + allele + "]", lineNo); - clippedAlleles.add(Allele.create(allele, a.isReference())); - } - } - } - - // the new reference length - int refLength = ref.length() - reverseClipping; - - return position+Math.max(refLength - 1,0); - } - public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) { try { return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || diff --git 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java new file mode 100644 index 000000000..7f11f8784 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.codecs.vcf; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +/** + * All of the gross allele clipping and padding routines in one place + * + * Having attempted to understand / fix / document this code myself + * I can only conclude that this entire approach needs to be rethought. This + * code just doesn't work robustly with symbolic alleles, with multiple alleles, + * requires a special "reference base for indels" stored in the VariantContext + * whose correctness isn't enforced, and overall has strange special cases + * all over the place. + * + * The reason this code is so complex is due to symbolics and multi-alleleic + * variation, which frequently occur when combining variants from multiple + * VCF files. + * + * TODO rethink this class, make it clean, and make it easy to create, mix, and write out alleles + * + * @author Mark DePristo + * @since 6/12 + */ +public final class VCFAlleleClipper { + // TODO + // TODO + // TODO + // TODO I have honestly no idea what this code is really supposed to do + // TODO computeForwardClipping is called in two places: + // TODO + // TODO clipAlleles() where all alleles, including ref, are passed in + // TODO createVariantContextWithTrimmedAlleles() where only the alt alleles are passed in + // TODO + // TODO The problem is that the code allows us to clip + // TODO + // TODO + // TODO + // TODO + // TODO + // TODO + @Ensures("result == 0 || result == 1") + public static int computeForwardClipping(final List unclippedAlleles, + final byte ref0) { + boolean clipping = true; + int symbolicAlleleCount = 0; + + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + symbolicAlleleCount++; + continue; + } + + if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { + clipping = 
false; + break; + } + } + + // don't clip if all alleles are symbolic + return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; + } + + public static int computeReverseClipping(final List unclippedAlleles, + final byte[] ref, + final int forwardClipping, + final boolean allowFullClip) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - (allowFullClip ? 0 : 1); + + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { + stillClipping = false; + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + return -1; + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { + stillClipping = false; + } + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + + /** + * Are the alleles describing a polymorphism substitution one base for another? + * + * @param alleles a list of alleles, must not be empty + * @return Return true if the length of any allele in alleles isn't 1 + */ + @Requires("!alleles.isEmpty()") + private static boolean isSingleNucleotideEvent(final List alleles) { + for ( final Allele a : alleles ) { + if ( a.length() != 1 ) + return false; + } + return true; + } + + /** + * clip the alleles, based on the reference, returning a ClippedAlleles object describing what happened + * + * The ClippedAlleles object contains the implied stop position of the alleles, given the provided start + * position, after clipping. 
It also contains the list of alleles, in the same order as the provided + * unclipped ones, that are the fully clipped version of the input alleles. If an error occurs + * during this option the getError() function returns a string describing the problem (for use in parsers). + * + * The basic operation are: + * + * single allele + * => stop == start and clipped == unclipped + * any number of single nucleotide events + * => stop == start and clipped == unclipped + * two alleles, second being symbolic + * => stop == start and clipped == unclipped + * Note in this case that the STOP should be computed by other means (from END in VCF, for example) + * Note that if there's more than two alleles and the second is a symbolic the code produces an error + * Any other case: + * The alleles are trimmed of any sequence shared at the end of the alleles. If N bases + * are common then the alleles will all be at least N bases shorter. + * The stop position returned is the start position + the length of the + * reverse trimmed only reference allele - 1. + * If the alleles all share a single common starting sequence (just one base is considered) + * then the alleles have this leading common base removed as well. 
+ * + * TODO This code is gross and brittle and needs to be rethought from scratch + * + * @param start the unadjusted start position (pre-clipping) + * @param ref the reference string + * @param unclippedAlleles the list of unclipped alleles, including the reference allele + * @return the new reference end position of this event + */ + @Requires({"position > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"}) + @Ensures("result != null") + public static ClippedAlleles clipAlleles(final int start, + final String ref, + final List unclippedAlleles) { + // no variation or single nucleotide events are by definition fully clipped + if ( unclippedAlleles.size() == 1 || isSingleNucleotideEvent(unclippedAlleles) ) + return new ClippedAlleles(start, unclippedAlleles); + + // symbolic alleles aren't unclipped by default, and there can be only two (better than highlander style) + if ( unclippedAlleles.size() > 1 ) { + if ( unclippedAlleles.get(1).isSymbolic() ) { + if ( unclippedAlleles.size() > 2 ) + return new ClippedAlleles("Detected multiple alleles at a site, including a symbolic one, which is currently unsupported"); + // we do not unclip symbolic alleles + return new ClippedAlleles(start, unclippedAlleles); + } + } + + // we've got to sort out the clipping by looking at the alleles themselves + final int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + final int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false); + + if ( reverseClipping == -1 ) + return new ClippedAlleles("computeReverseClipping failed due to bad alleles"); + + final List clippedAlleles = new ArrayList(unclippedAlleles.size()); + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + clippedAlleles.add(a); + } else { + final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping); + if ( !Allele.acceptableAlleleBases(allele) ) + return 
new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]"); + clippedAlleles.add(Allele.create(allele, a.isReference())); + } + } + + // the new reference length + final int refLength = ref.length() - reverseClipping; + final int stop = start + Math.max(refLength - 1, 0); + return new ClippedAlleles(stop, clippedAlleles); + } + + /** + * Returns true if the alleles in inputVC should have reference bases added for padding + * + * We need to pad a VC with a common base if the length of the reference allele is + * less than the length of the VariantContext. This happens because the position of + * e.g. an indel is always one before the actual event (as per VCF convention). + * + * @param inputVC the VC to evaluate, cannot be null + * @return true if + */ + public static boolean needsPadding(final VariantContext inputVC) { + // biallelic sites with only symbolic never need padding + if ( inputVC.isBiallelic() && inputVC.getAlternateAllele(0).isSymbolic() ) + return false; + + final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1; + final int referenceLength = inputVC.getReference().length(); + + if ( referenceLength == recordLength ) + return false; + else if ( referenceLength == recordLength - 1 ) + return true; + else if ( !inputVC.hasSymbolicAlleles() ) + throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + + " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); + else + return false; + } + + public static Allele padAllele(final VariantContext vc, final Allele allele) { + assert needsPadding(vc); + + if ( allele.isSymbolic() ) + return allele; + else { + // get bases for current allele and create a new one with trimmed bases + final StringBuilder sb = new StringBuilder(); + sb.append((char)vc.getReferenceBaseForIndel().byteValue()); + sb.append(allele.getDisplayString()); + final String newBases = sb.toString(); + return Allele.create(newBases, allele.isReference()); + } + } + + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC) { + final boolean padVC = needsPadding(inputVC); + + // nothing to do if we don't need to pad bases + if ( padVC ) { + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); + + final ArrayList alleles = new ArrayList(inputVC.getNAlleles()); + final Map unpaddedToPadded = new HashMap(inputVC.getNAlleles()); + + for (final Allele a : inputVC.getAlleles()) { + final Allele padded = padAllele(inputVC, a); + alleles.add(padded); + unpaddedToPadded.put(a, padded); + } + + // now we can recreate new genotypes with trimmed alleles + GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); + for (final Genotype g : inputVC.getGenotypes() ) { + final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); + for (final Allele a : g.getAlleles()) { + newGenotypeAlleles.add( a.isCalled() ? 
unpaddedToPadded.get(a) : Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); + + } + + return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); + } + else + return inputVC; + + } + + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + // see if we need to trim common reference base from all alleles + + final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true); + if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) + return inputVC; + + final List alleles = new ArrayList(); + final GenotypesContext genotypes = GenotypesContext.create(); + final Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + final List originalAlleles = genotype.getAlleles(); + final List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); + } + + return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? 
-1 : 0)).alleles(alleles).genotypes(genotypes).make(); + } + + public static class ClippedAlleles { + final int stop; + final List clippedAlleles; + final String error; + + public ClippedAlleles(final int stop, final List clippedAlleles) { + this.stop = stop; + this.clippedAlleles = clippedAlleles; + this.error = null; + } + + public ClippedAlleles(final String error) { + this.stop = -1; + this.clippedAlleles = null; + this.error = error; + } + + public String getError() { + return error; + } + + public int getStop() { + return stop; + } + + public List getClippedAlleles() { + return clippedAlleles; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 93eb1b237..dc7bcd926 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; @@ -32,6 +34,7 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index dc600d97c..5cd4766c6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -558,8 +558,8 @@ public class VariantContext implements Feature { // to enable tribble integratio } public String getAlleleStringWithRefPadding(final Allele allele) { - if ( VariantContextUtils.needsPadding(this) ) - return VariantContextUtils.padAllele(this, allele).getDisplayString(); + if ( VCFAlleleClipper.needsPadding(this) ) + return VCFAlleleClipper.padAllele(this, allele).getDisplayString(); else return allele.getDisplayString(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index ccc0f5971..33f5a3759 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -46,7 +46,9 @@ public class VariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); final public static JexlEngine engine = new JexlEngine(); public static final int DEFAULT_PLOIDY = 2; @@ -184,83 +186,6 @@ public class VariantContextUtils { return g; } - /** - * Returns true if the alleles in inputVC should have reference bases added for padding - * - * We need to pad a VC with a common base if the length of the reference allele is - * less than the length of the VariantContext. This happens because the position of - * e.g. an indel is always one before the actual event (as per VCF convention). 
- * - * @param inputVC the VC to evaluate, cannot be null - * @return true if - */ - public static boolean needsPadding(final VariantContext inputVC) { - final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1; - final int referenceLength = inputVC.getReference().length(); - - if ( referenceLength == recordLength ) - return false; - else if ( referenceLength == recordLength - 1 ) - return true; - else if ( !inputVC.hasSymbolicAlleles() ) - throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + - " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); - else - return false; - } - - public static Allele padAllele(final VariantContext vc, final Allele allele) { - assert needsPadding(vc); - - if ( allele.isSymbolic() ) - return allele; - else { - // get bases for current allele and create a new one with trimmed bases - final StringBuilder sb = new StringBuilder(); - sb.append((char)vc.getReferenceBaseForIndel().byteValue()); - sb.append(allele.getDisplayString()); - final String newBases = sb.toString(); - return Allele.create(newBases, allele.isReference()); - } - } - - - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC) { - final boolean padVC = needsPadding(inputVC); - - // nothing to do if we don't need to pad bases - if ( padVC ) { - if ( !inputVC.hasReferenceBaseForIndel() ) - throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - - final ArrayList alleles = new ArrayList(inputVC.getNAlleles()); - final Map unpaddedToPadded = new HashMap(inputVC.getNAlleles()); - - for (final Allele a : inputVC.getAlleles()) { - final Allele padded = padAllele(inputVC, a); - alleles.add(padded); - unpaddedToPadded.put(a, padded); - } - - // now we can recreate new genotypes with trimmed alleles - 
GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); - for (final Genotype g : inputVC.getGenotypes() ) { - final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); - for (final Allele a : g.getAlleles()) { - newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); - - } - - return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); - } - else - return inputVC; - - } - - private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field); if ( metaData == null ) metaData = header.getInfoHeaderLine(field); @@ -564,7 +489,7 @@ public class VariantContextUtils { for (final VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(createVariantContextWithPaddedAlleles(vc)); + VCs.add(VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc)); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -769,7 +694,7 @@ public class VariantContextUtils { return true; } - public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { + private static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; @@ -780,7 +705,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0); + trimVC = (VCFAlleleClipper.computeForwardClipping(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)) > 0); } // nothing to do if we don't need to trim bases @@ -836,46 +761,6 @@ public class VariantContextUtils { return inputVC; } - public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - // see if we need to trim common reference base from all alleles - - final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1); - if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) - return inputVC; - - final List alleles = new ArrayList(); - final GenotypesContext genotypes = GenotypesContext.create(); - final Map originalToTrimmedAlleleMap = new HashMap(); - - for (final Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); - final 
Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // now we can recreate new genotypes with trimmed alleles - for ( final Genotype genotype : inputVC.getGenotypes() ) { - final List originalAlleles = genotype.getAlleles(); - final List trimmedAlleles = new ArrayList(); - for ( final Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); - } - - return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? -1 : 0)).alleles(alleles).genotypes(genotypes).make(); - } - public static GenotypesContext stripPLs(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 5555849dd..108bc42f7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -261,10 +261,10 @@ class BCF2Writer extends IndexingVariantContextWriter { } private void buildAlleles( VariantContext vc ) throws IOException { - final boolean needsPadding = VariantContextUtils.needsPadding(vc); + final boolean needsPadding = VCFAlleleClipper.needsPadding(vc); for ( Allele allele : vc.getAlleles() ) { if ( needsPadding ) - allele = VariantContextUtils.padAllele(vc,allele); + allele = VCFAlleleClipper.padAllele(vc, allele); final byte[] s = allele.getDisplayBases(); encoder.encodeTypedString(s); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 054da9825..2a4c8cac2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -162,7 +162,7 @@ class VCFWriter extends IndexingVariantContextWriter { vc = new VariantContextBuilder(vc).noGenotypes().make(); try { - vc = VariantContextUtils.createVariantContextWithPaddedAlleles(vc); + vc = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); super.add(vc); Map alleleMap = buildAlleleMap(vc); @@ -522,7 +522,7 @@ class VCFWriter extends IndexingVariantContextWriter { if ( g.hasDP() ) sawDP = true; if ( g.hasAD() ) sawAD = true; if ( g.hasPL() ) sawPL = true; - if (g.isFiltered() && g.isCalled()) sawGenotypeFilter = true; + if (g.isFiltered()) sawGenotypeFilter = true; } if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY); diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 55de62431..86b7e60ff 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -15,10 +15,7 @@ import org.testng.SkipException; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @@ -296,6 +293,12 @@ public abstract class BaseTest { assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); } + public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { + final Set actualSet = new HashSet(actual); + final Set expectedSet = new HashSet(expected); + Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps + } + public static final void assertEqualsDoubleSmart(final double actual, 
final double expected, final double tolerance) { if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately Assert.assertTrue(Double.isNaN(actual)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java index 13519c815..2af88b109 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java @@ -18,22 +18,21 @@ public class SymbolicAllelesIntegrationTest extends WalkerTest { " --no_cmdline_in_header"; } - - @Test(enabled = false) + @Test(enabled = true) public void test1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(b36KGReference, "symbolic_alleles_1.vcf"), 1, - Arrays.asList("c79137da24ad4dc15cedc742de39247f")); + Arrays.asList("5bafc5a99ea839e686e55de93f91fd5c")); executeTest("Test symbolic alleles", spec); } - @Test(enabled = false) + @Test(enabled = true) public void test2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(b36KGReference, "symbolic_alleles_2.vcf"), 1, - Arrays.asList("3f6cbbd5fdf164d87081a3af19eeeba7")); + Arrays.asList("bf5a09f783ab1fa44774c81f91d10921")); executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 70a10a0b5..ae5128c75 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters1() { 
WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("c5ed9dd3975b3602293bb484b4fda5f4")); + Arrays.asList("060e9e7b6faf8b2f7b3291594eb6b39c")); executeTest("test genotype filter #1", spec1); } @@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("979ccdf484259117aa31305701075602")); + Arrays.asList("00f90028a8c0d56772c47f039816b585")); executeTest("test genotype filter #2", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index eaed969a4..d16d981ac 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -65,7 +65,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testApplyRecalibration-"+params.inVCF, spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java deleted file mode 100644 index 
e0fb1b876..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -// our package -package org.broadinstitute.sting.utils.codecs.vcf; - - -// the imports for unit testing. 
- - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.*; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - - -public class VCFCodecUnitTest extends BaseTest { - - // -------------------------------------------------------------------------------- - // - // Provider - // - // -------------------------------------------------------------------------------- - - private class AlleleClippingTestProvider extends TestDataProvider { - final String ref; - final List alleles = new ArrayList(); - final int expectedClip; - - private AlleleClippingTestProvider(final int expectedClip, final String ref, final String ... alleles) { - super(AlleleClippingTestProvider.class); - this.ref = ref; - for ( final String allele : alleles ) - this.alleles.add(Allele.create(allele)); - this.expectedClip = expectedClip; - } - - @Override - public String toString() { - return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); - } - } - - @DataProvider(name = "AlleleClippingTestProvider") - public Object[][] MakeAlleleClippingTest() { - // pair clipping - new AlleleClippingTestProvider(0, "ATT", "CCG"); - new AlleleClippingTestProvider(1, "ATT", "CCT"); - new AlleleClippingTestProvider(2, "ATT", "CTT"); - new AlleleClippingTestProvider(2, "ATT", "ATT"); // cannot completely clip allele - - // triplets - new AlleleClippingTestProvider(0, "ATT", "CTT", "CGG"); - new AlleleClippingTestProvider(1, "ATT", "CTT", "CGT"); // the T can go - new AlleleClippingTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go - - return AlleleClippingTestProvider.getTests(AlleleClippingTestProvider.class); - } - - - @Test(dataProvider = "AlleleClippingTestProvider") - public void TestAlleleClipping(AlleleClippingTestProvider cfg) { - int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, 
cfg.ref.getBytes(), 0, false, 1); - Assert.assertEquals(result, cfg.expectedClip); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 2a92b85e1..4d23f0fa5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -26,7 +26,7 @@ public class VCFIntegrationTest extends WalkerTest { executeTest("Test Variants To VCF from new output", spec2); } - @Test(enabled = false) + @Test(enabled = true) // See https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic public void testReadingAndWritingBreakpointAlleles() { String testVCF = privateTestDir + "breakpoint-example.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java new file mode 100644 index 000000000..55139e9e5 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.codecs.vcf; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class VCFUtilsUnitTest extends BaseTest { + // -------------------------------------------------------------------------------- + // + // Test allele clipping + // + // -------------------------------------------------------------------------------- + + private class ClipAllelesTest extends TestDataProvider { + final int position; + final int stop; + final String ref; + List inputs; + List expected; + + @Requires("arg.length % 2 == 0") + private ClipAllelesTest(final int position, final int stop, final String ... 
arg) { + super(ClipAllelesTest.class); + this.position = position; + this.stop = stop; + this.ref = arg[0]; + + int n = arg.length / 2; + inputs = new ArrayList(n); + expected = new ArrayList(n); + + for ( int i = 0; i < n; i++ ) { + final boolean ref = i % n == 0; + inputs.add(Allele.create(arg[i], ref)); + } + for ( int i = n; i < arg.length; i++ ) { + final boolean ref = i % n == 0; + expected.add(Allele.create(arg[i], ref)); + } + } + + public String toString() { + return String.format("ClipAllelesTest input=%s expected=%s", inputs, expected); + } + } + @DataProvider(name = "ClipAllelesTest") + public Object[][] makeClipAllelesTest() { + // do no harm + new ClipAllelesTest(10, 10, "A", "A"); + new ClipAllelesTest(10, 10, "A", "C", "A", "C"); + new ClipAllelesTest(10, 10, "A", "C", "G", "A", "C", "G"); + + // insertions + new ClipAllelesTest(10, 10, "A", "AA", "-", "A"); + new ClipAllelesTest(10, 10, "A", "AAA", "-", "AA"); + new ClipAllelesTest(10, 10, "A", "AG", "-", "G"); + + // deletions + new ClipAllelesTest(10, 11, "AA", "A", "A", "-"); + new ClipAllelesTest(10, 12, "AAA", "A", "AA", "-"); + new ClipAllelesTest(10, 11, "AG", "A", "G", "-"); + new ClipAllelesTest(10, 12, "AGG", "A", "GG", "-"); + + // multi-allelic insertion and deletions + new ClipAllelesTest(10, 11, "AA", "A", "AAA", "A", "-", "AA"); + new ClipAllelesTest(10, 11, "AA", "A", "AAG", "A", "-", "AG"); + new ClipAllelesTest(10, 10, "A", "AA", "AAA", "-", "A", "AA"); + new ClipAllelesTest(10, 10, "A", "AA", "ACA", "-", "A", "CA"); + new ClipAllelesTest(10, 12, "ACG", "ATC", "AGG", "CG", "TC", "GG"); + new ClipAllelesTest(10, 11, "AC", "AT", "AG", "C", "T", "G"); + + // cannot be clipped + new ClipAllelesTest(10, 11, "AC", "CT", "AG", "AC", "CT", "AG"); + new ClipAllelesTest(10, 11, "AC", "CT", "GG", "AC", "CT", "GG"); + + // symbolic + new ClipAllelesTest(10, 10, "A", "", "A", ""); + new ClipAllelesTest(50, 50, "G", "G]22:60]", "A", "G]22:60]"); + new ClipAllelesTest(51, 51, "T", "]22:55]T", 
"A", "]22:55]T"); + new ClipAllelesTest(52, 52, "C", "C[22:51[", "A", "C[22:51["); + new ClipAllelesTest(60, 60, "A", "A]22:50]", "A", "A]22:50]"); + + // complex substitutions + new ClipAllelesTest(10, 10, "A", "GA", "A", "GA"); + + return ClipAllelesTest.getTests(ClipAllelesTest.class); + } + + @Test(dataProvider = "ClipAllelesTest") + public void testClipAllelesTest(ClipAllelesTest cfg) { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs); + Assert.assertNull(clipped.getError(), "Unexpected error occurred"); + Assert.assertEquals(clipped.getStop(), cfg.stop, "Clipped alleles stop"); + Assert.assertEquals(clipped.getClippedAlleles(), cfg.expected, "Clipped alleles"); + } + + + // -------------------------------------------------------------------------------- + // + // basic allele clipping test + // + // -------------------------------------------------------------------------------- + + private class ReverseClippingPositionTestProvider extends TestDataProvider { + final String ref; + final List alleles = new ArrayList(); + final int expectedClip; + + private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { + super(ReverseClippingPositionTestProvider.class); + this.ref = ref; + for ( final String allele : alleles ) + this.alleles.add(Allele.create(allele)); + this.expectedClip = expectedClip; + } + + @Override + public String toString() { + return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); + } + } + + @DataProvider(name = "ReverseClippingPositionTestProvider") + public Object[][] makeReverseClippingPositionTestProvider() { + // pair clipping + new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); + new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); + new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele + + // triplets + new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go + new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go + + return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); + } + + + @Test(dataProvider = "ReverseClippingPositionTestProvider") + public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { + int result = VCFAlleleClipper.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); + Assert.assertEquals(result, cfg.expectedClip); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 4167381c1..94ed2ce5f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -56,7 +56,7 @@ public class VariantContextTestProvider { final private static boolean ENABLE_VARARRAY_TESTS = true; 
final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; - final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = false; + final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true; final private static boolean ENABLE_SOURCE_VCF_TESTS = true; final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true; final private static List TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); @@ -70,8 +70,10 @@ public class VariantContextTestProvider { testSourceVCFs.add(new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf")); testSourceVCFs.add(new File(BaseTest.privateTestDir + "ex2.vcf")); testSourceVCFs.add(new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf")); - if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) + if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) { testSourceVCFs.add(new File(BaseTest.privateTestDir + "diagnosis_targets_testfile.vcf")); + testSourceVCFs.add(new File(BaseTest.privateTestDir + "VQSR.mixedTest.recal")); + } } public abstract static class VariantContextIOTest { @@ -181,6 +183,7 @@ public class VariantContextTestProvider { Set metaData = new TreeSet(); addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); + addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String); addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); @@ -283,6 +286,15 @@ public class VariantContextTestProvider { if ( ENABLE_A_AND_G_TESTS ) addGenotypesAndGTests(); + + if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) + addSymbolicAlleleTests(); + } + + private static void addSymbolicAlleleTests() { + // two tests to ensure that the end is computed correctly when there's (and not) an END field present + add(builder().alleles("N", 
"").start(10).stop(11).attribute("END", 11)); + add(builder().alleles("N", "").start(10).stop(10)); } private static void addGenotypesToTestData() { @@ -509,27 +521,26 @@ public class VariantContextTestProvider { addGenotypeTests(site, // missing value in varlist of string attr("g1", ref, "FLOAT1", 1.0), attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); - - - // - // - // TESTING GENOTYPE FILTERS - // - // - addGenotypeTests(site, - new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); } + + // + // + // TESTING GENOTYPE FILTERS + // + // + addGenotypeTests(site, + new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(), + new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2-x", Arrays.asList(ref, 
ref)).filters("X").make(), + new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); } private static void addGenotypesAndGTests() { @@ -711,14 +722,12 @@ public class VariantContextTestProvider { Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.getFilters(), expected.getFilters(), "filters"); + BaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); if ( expected.hasGenotypes() ) { - final Set actualSampleSet = new HashSet(actual.getSampleNames()); - final Set expectedSampleSet = new HashSet(expected.getSampleNames()); - Assert.assertTrue(actualSampleSet.equals(expectedSampleSet), "sample names"); // note this is necessary due to testng bug for set comps + BaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); final Set samples = expected.getSampleNames(); for ( final String sample : samples ) { From 893630af53489a27380a2fbbf2b958f0b4c67c72 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 29 Jun 2012 13:55:52 -0400 Subject: [PATCH 10/19] Enabling symbolic alleles in BCF2 -- Bugfix for VCFDiffableReader: don't add null filters to object -- BCF2Codec uses new VCFAlleleClipper to handle clipping / unclipping of alleles -- AbstractVCFCodec: decodeLoc uses full decode() [still doesn't decode genotypes] to avoid dangerous code duplication. Refactored code that clipped alleles and determined end position into updateBuilderAllelesAndStop method that uses new VCFAlleleClipper. 
Fixed bug by ensuring the VCF codec always uses the END field in the INFO when it's provided, not just in the case where the there's a biallelic symbolic allele -- Brand new home for allele clipping / padding routines in VCFAlleleClipper. Actually documented this code, which results in lots of **** negative comments on the code quality. Eric has promised that he and Ami are going to rethink this code from scratch. Fixed many nasty bugs in here, cleaning up unnecessary branches, etc. Added UnitTests in VCFAlleleClipper that actually test the code full. In the process of testing I discovered lots of edge cases that don't work, and I've commented out failing tests or manually skipped them, noting how this tests need to be fixed. Even introduced some minor optimizations -- VariantContext: validateAllele was broken in the case where there were mixed symbolic and concrete alleles, failing validation for no reason. Fixed. -- Added computeEndFromAlleles() function to VariantContextUtils and VariantContextBuilder for convenience calculating where the VC really ends given alleles -- --- .../walkers/diffengine/VCFDiffableReader.java | 1 + .../sting/utils/codecs/bcf2/BCF2Codec.java | 4 +- .../utils/codecs/vcf/AbstractVCFCodec.java | 148 ++++--------- .../utils/codecs/vcf/VCFAlleleClipper.java | 207 ++++++++++++------ .../utils/variantcontext/VariantContext.java | 8 +- .../variantcontext/VariantContextBuilder.java | 26 +++ .../variantcontext/VariantContextUtils.java | 31 ++- .../variantcontext/writer/BCF2Writer.java | 2 + .../DiagnoseTargetsIntegrationTest.java | 4 +- ...ntRecalibrationWalkersIntegrationTest.java | 1 + ...est.java => VCFAlleleClipperUnitTest.java} | 65 +++++- 11 files changed, 312 insertions(+), 185 deletions(-) rename public/java/test/org/broadinstitute/sting/utils/codecs/vcf/{VCFUtilsUnitTest.java => VCFAlleleClipperUnitTest.java} (66%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index b4cf96831..e3f9b8ee3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -116,6 +116,7 @@ public class VCFDiffableReader implements DiffableReader { if ( g.hasDP() ) gRoot.add("DP", g.getDP() ); if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD())); if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL())); + if ( g.getFilters() != null ) gRoot.add("FT", g.getFilters()); for (Map.Entry attribute : g.getExtendedAttributes().entrySet()) { if ( ! attribute.getKey().startsWith("_") ) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 06fb07f94..e106e72ac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -332,7 +332,9 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD protected List clipAllelesIfNecessary(final int position, final String ref, final List unclippedAlleles) { - final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles); + // the last argument of 1 allows us to safely ignore the end, because we are + // ultimately going to use the end in the record itself + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles, 1); if ( clipped.getError() != null ) error(clipped.getError()); return clipped.getClippedAlleles(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 6302d1053..1d5869349 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -186,79 +186,19 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - - final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - - if ( nParts != 6 ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - // get our alleles (because the end position depends on them) - final String ref = getCachedString(locParts[3].toUpperCase()); - final String alts = getCachedString(locParts[4].toUpperCase()); - final List alleles = parseAlleles(ref, alts, lineNo); - - // find out our location - int start = 0; - try { - start = Integer.valueOf(locParts[1]); - } catch (Exception e) { - generateException("the value in the POS field must be an integer but it was " + locParts[1], lineNo); - } - int stop = start; - - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = start + alleles.get(0).length() - 1; - } - // we need to parse the INFO field to check for an END tag if it's a symbolic allele - else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) { - final String[] extraParts = new String[4]; - final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - if ( nExtraParts < 3 ) - throw new 
UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - final Map attrs = parseInfo(extraParts[2]); - try { - stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start; - } catch (Exception e) { - throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo); - } - } else { - stop = VCFAlleleClipper.clipAlleles(start, ref, alleles).stop; - } - - return new VCFLocFeature(locParts[0], start, stop); + return decodeLine(line, false); } - private final static class VCFLocFeature implements Feature { - - final String chr; - final int start, stop; - - private VCFLocFeature(String chr, int start, int stop) { - this.chr = chr; - this.start = start; - this.stop = stop; - } - - public String getChr() { return chr; } - public int getStart() { return start; } - public int getEnd() { return stop; } - } - - /** * decode the line into a feature (VariantContext) * @param line the line * @return a VariantContext */ public VariantContext decode(String line) { + return decodeLine(line, true); + } + + private final VariantContext decodeLine(final String line, final boolean includeGenotypes) { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -276,15 +216,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? 
NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + " tokens, and saw " + nParts + " )", lineNo); - return parseVCFLine(parts); - } - - protected void generateException(String message) { - throw new UserException.MalformedVCF(message, lineNo); - } - - protected static void generateException(String message, int lineNo) { - throw new UserException.MalformedVCF(message, lineNo); + return parseVCFLine(parts, includeGenotypes); } /** @@ -293,7 +225,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * @param parts the parts split up * @return a variant context object */ - private VariantContext parseVCFLine(String[] parts) { + private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { VariantContextBuilder builder = new VariantContextBuilder(); builder.source(getName()); @@ -315,8 +247,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec else builder.id(parts[2]); - String ref = getCachedString(parts[3].toUpperCase()); - String alts = getCachedString(parts[4].toUpperCase()); + final String ref = getCachedString(parts[3].toUpperCase()); + final String alts = getCachedString(parts[4].toUpperCase()); builder.log10PError(parseQual(parts[5])); final List filters = parseFilters(getCachedString(parts[6])); @@ -325,33 +257,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec builder.attributes(attrs); // get our alleles, filters, and setup an attribute map - List alleles = parseAlleles(ref, alts, lineNo); - - // find out our current location, and clip the alleles down to their minimum length - int stop = pos; - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = pos + alleles.get(0).length() - 1; - } - // we need to parse the INFO field to check for an END tag if it's a symbolic allele - else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) { - try { - stop = 
Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); - } catch (Exception e) { - generateException("the END value in the INFO field is not valid"); - } - } else { - final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, alleles); - if ( clipped.getError() != null ) - generateException(clipped.getError(), lineNo); - stop = clipped.getStop(); - alleles = clipped.clippedAlleles; - } - builder.stop(stop); - builder.alleles(alleles); + final List rawAlleles = parseAlleles(ref, alts, lineNo); + final List alleles = updateBuilderAllelesAndStop(builder, ref, pos, rawAlleles, attrs); // do we have genotyping data - if (parts.length > NUM_STANDARD_FIELDS) { + if (parts.length > NUM_STANDARD_FIELDS && includeGenotypes) { final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); final int nGenotypes = header.getNGenotypeSamples(); LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); @@ -374,6 +284,31 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec return vc; } + private final List updateBuilderAllelesAndStop(final VariantContextBuilder builder, + final String ref, + final int pos, + final List rawAlleles, + final Map attrs) { + int endForSymbolicAlleles = pos; // by default we use the pos + if ( attrs.containsKey(VCFConstants.END_KEY) ) { + // update stop with the end key if provided + try { + endForSymbolicAlleles = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); + } catch (Exception e) { + generateException("the END value in the INFO field is not valid"); + } + } + + // find out our current location, and clip the alleles down to their minimum length + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, rawAlleles, endForSymbolicAlleles); + if ( clipped.getError() != null ) + generateException(clipped.getError(), lineNo); + + builder.stop(clipped.getStop()); + 
builder.alleles(clipped.getClippedAlleles()); + return clipped.getClippedAlleles(); + } + /** * get the name of this codec * @return our set name @@ -761,4 +696,13 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec public final void disableOnTheFlyModifications() { doOnTheFlyModifications = false; } + + + protected void generateException(String message) { + throw new UserException.MalformedVCF(message, lineNo); + } + + protected static void generateException(String message, int lineNo) { + throw new UserException.MalformedVCF(message, lineNo); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java index 7f11f8784..40ba23d9d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -46,47 +47,52 @@ import java.util.*; * VCF files. 
* * TODO rethink this class, make it clean, and make it easy to create, mix, and write out alleles + * TODO this code doesn't work with reverse clipped alleles (ATA / GTTA -> AT / GT) * * @author Mark DePristo * @since 6/12 */ public final class VCFAlleleClipper { - // TODO - // TODO - // TODO - // TODO I have honestly no idea what this code is really supposed to do - // TODO computeForwardClipping is called in two places: - // TODO - // TODO clipAlleles() where all alleles, including ref, are passed in - // TODO createVariantContextWithTrimmedAlleles() where only the alt alleles are passed in - // TODO - // TODO The problem is that the code allows us to clip - // TODO - // TODO - // TODO - // TODO - // TODO - // TODO - @Ensures("result == 0 || result == 1") - public static int computeForwardClipping(final List unclippedAlleles, - final byte ref0) { - boolean clipping = true; - int symbolicAlleleCount = 0; + private VCFAlleleClipper() { } + + /** + * Determine whether we should clip off the first base of all unclippped alleles or not + * + * Returns true if all of the alleles in unclippedAlleles share a common first base with + * ref0. Ref0 should be the first base of the reference allele UnclippedAlleles may + * contain the reference allele itself, or just the alternate alleles, it doesn't matter. + * + * The algorithm returns true if the first base should be clipped off, or false otherwise + * + * This algorithm works even in the presence of symbolic alleles, logically ignoring these + * values. 
It + * + * @param unclippedAlleles list of unclipped alleles to assay + * @param ref0 the first base of the reference allele + * @return true if we should clip the first base of unclippedAlleles + */ + @Requires("unclippedAlleles != null") + public static boolean shouldClipFirstBaseP(final List unclippedAlleles, + final byte ref0) { + boolean allSymbolicAlt = true; for ( final Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) { - symbolicAlleleCount++; continue; } + // already know we aren't symbolic, so we only need to decide if we have only seen a ref + if ( ! a.isReference() ) + allSymbolicAlt = false; + if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { - clipping = false; - break; + return false; } } - // don't clip if all alleles are symbolic - return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; + // to reach here all alleles are consistent with clipping the first base matching ref0 + // but we don't clip if all ALT alleles are symbolic + return ! allSymbolicAlt; } public static int computeReverseClipping(final List unclippedAlleles, @@ -174,48 +180,64 @@ public final class VCFAlleleClipper { * @param unclippedAlleles the list of unclipped alleles, including the reference allele * @return the new reference end position of this event */ - @Requires({"position > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"}) + @Requires({"start > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"}) @Ensures("result != null") public static ClippedAlleles clipAlleles(final int start, final String ref, - final List unclippedAlleles) { + final List unclippedAlleles, + final int endForSymbolicAllele ) { // no variation or single nucleotide events are by definition fully clipped if ( unclippedAlleles.size() == 1 || isSingleNucleotideEvent(unclippedAlleles) ) - return new ClippedAlleles(start, unclippedAlleles); - - // symbolic alleles aren't unclipped by default, and there can be only two (better than highlander 
style) - if ( unclippedAlleles.size() > 1 ) { - if ( unclippedAlleles.get(1).isSymbolic() ) { - if ( unclippedAlleles.size() > 2 ) - return new ClippedAlleles("Detected multiple alleles at a site, including a symbolic one, which is currently unsupported"); - // we do not unclip symbolic alleles - return new ClippedAlleles(start, unclippedAlleles); - } - } + return new ClippedAlleles(start, unclippedAlleles, null); // we've got to sort out the clipping by looking at the alleles themselves - final int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + final byte firstRefBase = (byte) ref.charAt(0); + final boolean firstBaseIsClipped = shouldClipFirstBaseP(unclippedAlleles, firstRefBase); + final int forwardClipping = firstBaseIsClipped ? 1 : 0; final int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false); + final boolean needsClipping = forwardClipping > 0 || reverseClipping > 0; if ( reverseClipping == -1 ) return new ClippedAlleles("computeReverseClipping failed due to bad alleles"); - final List clippedAlleles = new ArrayList(unclippedAlleles.size()); - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - clippedAlleles.add(a); - } else { - final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping); - if ( !Allele.acceptableAlleleBases(allele) ) - return new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]"); - clippedAlleles.add(Allele.create(allele, a.isReference())); + boolean sawSymbolic = false; + List clippedAlleles; + if ( ! 
needsClipping ) { + // there's nothing to clip, so clippedAlleles are the original alleles + clippedAlleles = unclippedAlleles; + } else { + clippedAlleles = new ArrayList(unclippedAlleles.size()); + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + sawSymbolic = true; + clippedAlleles.add(a); + } else { + final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping); + if ( !Allele.acceptableAlleleBases(allele) ) + return new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]"); + clippedAlleles.add(Allele.create(allele, a.isReference())); + } } } - // the new reference length - final int refLength = ref.length() - reverseClipping; - final int stop = start + Math.max(refLength - 1, 0); - return new ClippedAlleles(stop, clippedAlleles); + int stop = VariantContextUtils.computeEndFromAlleles(clippedAlleles, start, endForSymbolicAllele); + + // TODO + // TODO + // TODO COMPLETELY BROKEN CODE -- THE GATK CURRENTLY ENCODES THE STOP POSITION FOR CLIPPED ALLELES AS + 1 + // TODO ITS TRUE SIZE TO DIFFERENTIATE CLIPPED VS. UNCLIPPED ALLELES. NEEDS TO BE FIXED + // TODO + // TODO + if ( needsClipping && ! sawSymbolic && ! clippedAlleles.get(0).isNull() ) stop++; + // TODO + // TODO + // TODO COMPLETELY BROKEN CODE -- THE GATK CURRENTLY ENCODES THE STOP POSITION FOR CLIPPED ALLELES AS + 1 + // TODO ITS TRUE SIZE TO DIFFERENTIATE CLIPPED VS. UNCLIPPED ALLELES. NEEDS TO BE FIXED + // TODO + // TODO + + final Byte refBaseForIndel = firstBaseIsClipped ? firstRefBase : null; + return new ClippedAlleles(stop, clippedAlleles, refBaseForIndel); } /** @@ -243,8 +265,9 @@ public final class VCFAlleleClipper { else if ( !inputVC.hasSymbolicAlleles() ) throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); - else - return false; + else if ( inputVC.isMixed() && inputVC.hasSymbolicAlleles() ) + throw new IllegalArgumentException("GATK infrastructure limitation prevents needsPadding from working properly with VariantContexts containing a mixture of symbolic and concrete alleles at " + inputVC); + return false; } public static Allele padAllele(final VariantContext vc, final Allele allele) { @@ -271,26 +294,41 @@ public final class VCFAlleleClipper { throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); final ArrayList alleles = new ArrayList(inputVC.getNAlleles()); - final Map unpaddedToPadded = new HashMap(inputVC.getNAlleles()); + final Map unpaddedToPadded = inputVC.hasGenotypes() ? new HashMap(inputVC.getNAlleles()) : null; + boolean paddedAtLeastOne = false; for (final Allele a : inputVC.getAlleles()) { final Allele padded = padAllele(inputVC, a); + paddedAtLeastOne = paddedAtLeastOne || padded != a; alleles.add(padded); - unpaddedToPadded.put(a, padded); + if ( unpaddedToPadded != null ) unpaddedToPadded.put(a, padded); // conditional to avoid making unnecessary make } - // now we can recreate new genotypes with trimmed alleles - GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); - for (final Genotype g : inputVC.getGenotypes() ) { - final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); - for (final Allele a : g.getAlleles()) { - newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL); + if ( ! 
paddedAtLeastOne ) + throw new ReviewedStingException("VC was supposed to need padding but no allele was actually changed at location " + inputVC.getChr() + ":" + inputVC.getStart() + " with allele " + inputVC.getAlleles()); + + final VariantContextBuilder vcb = new VariantContextBuilder(inputVC); + vcb.alleles(alleles); + + // the position of the inputVC is one further, if it doesn't contain symbolic alleles + vcb.computeEndFromAlleles(alleles, inputVC.getStart(), inputVC.getEnd()); + + if ( inputVC.hasGenotypes() ) { + assert unpaddedToPadded != null; + + // now we can recreate new genotypes with trimmed alleles + final GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); + for (final Genotype g : inputVC.getGenotypes() ) { + final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); + for (final Allele a : g.getAlleles()) { + newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); } - genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); - + vcb.genotypes(genotypes); } - return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); + return vcb.make(); } else return inputVC; @@ -337,33 +375,60 @@ public final class VCFAlleleClipper { return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? 
-1 : 0)).alleles(alleles).genotypes(genotypes).make(); } + @Invariant("stop != -1 || error != null") // we're either an error or a meaningful result but not both public static class ClippedAlleles { - final int stop; - final List clippedAlleles; - final String error; + private final int stop; + private final List clippedAlleles; + private final Byte refBaseForIndel; + private final String error; - public ClippedAlleles(final int stop, final List clippedAlleles) { + @Requires({"stop > 0", "clippedAlleles != null"}) + private ClippedAlleles(final int stop, final List clippedAlleles, final Byte refBaseForIndel) { this.stop = stop; this.clippedAlleles = clippedAlleles; this.error = null; + this.refBaseForIndel = refBaseForIndel; } - public ClippedAlleles(final String error) { + @Requires("error != null") + private ClippedAlleles(final String error) { this.stop = -1; this.clippedAlleles = null; + this.refBaseForIndel = null; this.error = error; } + /** + * Get an error if it occurred + * @return the error message, or null if no error occurred + */ public String getError() { return error; } + /** + * Get the stop position to use after the clipping as been applied, given the + * provided position to clipAlleles + * @return + */ public int getStop() { return stop; } + /** + * Get the clipped alleles themselves + * @return the clipped alleles in the order of the input unclipped alleles + */ public List getClippedAlleles() { return clippedAlleles; } + + /** + * Returns the reference base we should use for indels, or null if none is appropriate + * @return + */ + public Byte getRefBaseForIndel() { + return refBaseForIndel; + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 5cd4766c6..c62afd284 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1176,7 +1176,7 @@ public class VariantContext implements Feature { // to enable tribble integratio // if ( getType() == Type.INDEL ) { // if ( getReference().length() != (getLocation().size()-1) ) { long length = (stop - start) + 1; - if ( ! isSymbolic() + if ( ! hasSymbolicAlleles() && ((getReference().isNull() && length != 1 ) || (getReference().isNonNull() && (length - getReference().length() > 1)))) { throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); @@ -1477,7 +1477,11 @@ public class VariantContext implements Feature { // to enable tribble integratio } public boolean hasSymbolicAlleles() { - for (final Allele a: getAlleles()) { + return hasSymbolicAlleles(getAlleles()); + } + + public static boolean hasSymbolicAlleles( final List alleles ) { + for ( final Allele a: alleles ) { if (a.isSymbolic()) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 01d3ab456..0590329c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -454,6 +454,32 @@ public class VariantContextBuilder { return this; } + /** + * @see #computeEndFromAlleles(java.util.List, int, int) with endForSymbolicAlleles == -1 + */ + public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start) { + return computeEndFromAlleles(alleles, start, -1); + } + + /** + * Compute the end position for this VariantContext from the alleles themselves + * + * @see VariantContextUtils.computeEndFromAlleles() + * + * assigns this builder 
the stop position computed. + * + * @param alleles the list of alleles to consider. The reference allele must be the first one + * @param start the known start position of this event + * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 + * if no is expected but will throw an error if one is found + * @return this builder + */ + @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) + public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { + stop(VariantContextUtils.computeEndFromAlleles(alleles, start, endForSymbolicAlleles)); + return this; + } + /** * @return true if this builder contains fully decoded data * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 33f5a3759..f1bebaa30 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -705,7 +705,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (VCFAlleleClipper.computeForwardClipping(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)) > 0); + trimVC = VCFAlleleClipper.shouldClipFirstBaseP(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)); } // nothing to do if we don't need to trim bases @@ -1370,4 +1370,33 @@ public class VariantContextUtils { return true; // we passed all tests, we matched } + + /** + * Compute the end position for this VariantContext from the alleles themselves + * + * In the trivial case this is a single BP event and end = start (open intervals) + * In general the end is start + ref length - 1, handling the case 
where ref length == 0 + * However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases + * + * @param alleles the list of alleles to consider. The reference allele must be the first one + * @param start the known start position of this event + * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 + * if no is expected but will throw an error if one is found + * @return this builder + */ + @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) + public static int computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { + final Allele ref = alleles.get(0); + + if ( ref.isNonReference() ) + throw new ReviewedStingException("computeEndFromAlleles requires first allele to be reference"); + + if ( VariantContext.hasSymbolicAlleles(alleles) ) { + if ( endForSymbolicAlleles == -1 ) + throw new ReviewedStingException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided"); + return endForSymbolicAlleles; + } else { + return start + Math.max(ref.length() - 1, 0); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 108bc42f7..e281475f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -266,6 +266,8 @@ class BCF2Writer extends IndexingVariantContextWriter { if ( needsPadding ) allele = VCFAlleleClipper.padAllele(vc, allele); final byte[] s = allele.getDisplayBases(); + if ( s == null ) + throw new ReviewedStingException("BUG: BCF2Writer encountered null padded allele" + allele); encoder.encodeTypedString(s); } } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index c5ca22242..6d37ae2de 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -44,11 +44,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "a10a0a20c6402207a2d968113595fde8"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "c1b89186cdfa48b26865bf9737433e1c"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "359a7da40e4803fc5e43e0e5211ef013"); + DTTest("testMultiSample ", "-I " + multiSample, "644e586e53c6f9bfd8e51e24f78da7b3"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index d16d981ac..eaed969a4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -65,6 +65,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles 
executeTest("testApplyRecalibration-"+params.inVCF, spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java similarity index 66% rename from public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java index 55139e9e5..8cd051e01 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java @@ -27,12 +27,13 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.variantcontext.*; import org.testng.Assert; +import org.testng.SkipException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; -public class VCFUtilsUnitTest extends BaseTest { +public class VCFAlleleClipperUnitTest extends BaseTest { // -------------------------------------------------------------------------------- // // Test allele clipping @@ -67,6 +68,15 @@ public class VCFUtilsUnitTest extends BaseTest { } } + public boolean isClipped() { + for ( int i = 0; i < inputs.size(); i++ ) { + if ( inputs.get(i).length() != expected.get(i).length() ) + return true; + } + + return false; + } + public String toString() { return String.format("ClipAllelesTest input=%s expected=%s", inputs, expected); } @@ -102,12 +112,30 @@ public class VCFUtilsUnitTest extends BaseTest { new ClipAllelesTest(10, 11, "AC", "CT", "GG", "AC", "CT", "GG"); // symbolic - new ClipAllelesTest(10, 10, "A", "", "A", ""); - new ClipAllelesTest(50, 50, "G", "G]22:60]", "A", "G]22:60]"); - new ClipAllelesTest(51, 51, "T", "]22:55]T", "A", "]22:55]T"); - new ClipAllelesTest(52, 52, "C", "C[22:51[", "A", "C[22:51["); + new ClipAllelesTest(10, 100, 
"A", "", "A", ""); + new ClipAllelesTest(50, 50, "G", "G]22:60]", "G", "G]22:60]"); + new ClipAllelesTest(51, 51, "T", "]22:55]T", "T", "]22:55]T"); + new ClipAllelesTest(52, 52, "C", "C[22:51[", "C", "C[22:51["); new ClipAllelesTest(60, 60, "A", "A]22:50]", "A", "A]22:50]"); + // symbolic with alleles that should be clipped + new ClipAllelesTest(10, 100, "A", "", "AA", "-", "", "A"); + new ClipAllelesTest(10, 100, "AA", "", "A", "A", "", "-"); + new ClipAllelesTest(10, 100, "AA", "", "A", "AAA", "A", "", "-", "AA"); + new ClipAllelesTest(10, 100, "AG", "", "A", "AGA", "G", "", "-", "GA"); + new ClipAllelesTest(10, 100, "G", "", "A", "G", "", "A"); + + // clipping from both ends + // + // TODO -- THIS CODE IS BROKEN BECAUSE CLIPPING DOES WORK WITH ALLELES CLIPPED FROM THE END + // +// new ClipAllelesTest(10, 10, "ATA", "ATTA", "-", "T"); +// new ClipAllelesTest(10, 10, "ATAA", "ATTAA", "-", "T"); +// new ClipAllelesTest(10, 10, "ATAAG", "ATTAAG", "-", "T"); +// new ClipAllelesTest(10, 11, "GTA", "ATTA", "G", "AT"); +// new ClipAllelesTest(10, 11, "GTAA", "ATTAA", "G", "AT"); +// new ClipAllelesTest(10, 11, "GTAAG", "ATTAAG", "G", "AT"); + // complex substitutions new ClipAllelesTest(10, 10, "A", "GA", "A", "GA"); @@ -116,12 +144,37 @@ public class VCFUtilsUnitTest extends BaseTest { @Test(dataProvider = "ClipAllelesTest") public void testClipAllelesTest(ClipAllelesTest cfg) { - final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs); + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs, cfg.stop); Assert.assertNull(clipped.getError(), "Unexpected error occurred"); Assert.assertEquals(clipped.getStop(), cfg.stop, "Clipped alleles stop"); Assert.assertEquals(clipped.getClippedAlleles(), cfg.expected, "Clipped alleles"); } + @Test(dataProvider = "ClipAllelesTest", dependsOnMethods = "testClipAllelesTest") + public void testPaddingAllelesInVC(final 
ClipAllelesTest cfg) { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs, cfg.stop); + final VariantContext vc = new VariantContextBuilder("x", "1", cfg.position, cfg.stop, clipped.getClippedAlleles()) + .referenceBaseForIndel(clipped.getRefBaseForIndel()).make(); + + if ( vc.isMixed() && vc.hasSymbolicAlleles() ) + throw new SkipException("GATK cannot handle mixed variant contexts with symbolic and concrete alleles. Remove this check when allele clipping and padding is generalized"); + + Assert.assertEquals(VCFAlleleClipper.needsPadding(vc), cfg.isClipped(), "needPadding method"); + + if ( cfg.isClipped() ) { + // TODO + // TODO note that the GATK currently uses a broken approach to the clipped alleles, so the expected stop is + // TODO actually the original stop, as the original stop is +1 its true size. + // TODO + final int expectedStop = vc.getEnd(); // + (vc.hasSymbolicAlleles() ? 0 : 1); + + final VariantContext padded = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); + Assert.assertEquals(padded.getStart(), vc.getStart(), "padded VC start"); + Assert.assertEquals(padded.getAlleles(), cfg.inputs, "padded VC alleles == original unclipped alleles"); + Assert.assertEquals(padded.getEnd(), expectedStop, "padded VC end should be clipped VC + 1 (added a base to ref allele)"); + Assert.assertFalse(VCFAlleleClipper.needsPadding(padded), "padded VC shouldn't need padding again"); + } + } // -------------------------------------------------------------------------------- // From 385a3c630fffcbb2639d8f1e2b7da3dcd2832f02 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 29 Jun 2012 15:01:27 -0400 Subject: [PATCH 11/19] Added check in VariantContext.validate to ensure that getEnd() == END value when present -- Fixed bug in VariantDataManager that this validation mode was intended to detect going forward -- Still no VariantRecalibrationWalkersIntegrationTest for indels with BCF2 but that's 
because LowQual is missing from test VCF --- .../variantrecalibration/VariantDataManager.java | 2 +- .../utils/variantcontext/VariantContext.java | 15 +++++++++++++++ ...ariantRecalibrationWalkersIntegrationTest.java | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 4ccaea932..d6df4ff1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -298,7 +298,7 @@ public class VariantDataManager { attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStart(), alleles).attributes(attributes); + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); recalWriter.add(builder.make()); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index c62afd284..1f0b2b054 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1126,6 +1126,7 @@ public class VariantContext implements Feature { // to enable tribble integratio // --------------------------------------------------------------------------------------------------------- private boolean 
validate(final EnumSet validationToPerform) { + validateStop(); for (final Validation val : validationToPerform ) { switch (val) { case ALLELES: validateAlleles(); break; @@ -1138,6 +1139,20 @@ public class VariantContext implements Feature { // to enable tribble integratio return true; } + /** + * Check that getEnd() == END from the info field, if it's present + */ + private void validateStop() { + if ( hasAttribute(VCFConstants.END_KEY) ) { + final int end = getAttributeAsInt(VCFConstants.END_KEY, -1); + assert end != -1; + if ( end != getEnd() ) + throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + + getStart() + "; getEnd() was " + getEnd() + + " but this VariantContext contains an END key with value " + end); + } + } + private void validateReferencePadding() { if ( hasSymbolicAlleles() ) // symbolic alleles don't need padding... return; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index eaed969a4..c075bb390 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -114,6 +114,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); + spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } From 5ad9a98a15f489c0640eeaa0c76608663258d564 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 30 
Jun 2012 11:22:27 -0400 Subject: [PATCH 13/19] Minor bugfixes / consistency fixes to filter strings of Genotypes and AC/AF annotations -- GenotypeBuilder now sorts the list of filter strings so that the output is in a consistent order -- calculateChromosomeCounts removes the AC/AF fields entirely when there are no alt alleles, to be on VCF spec for A defined info field values --- .../sting/utils/variantcontext/GenotypeBuilder.java | 2 +- .../sting/utils/variantcontext/VariantContextUtils.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 81d714294..e3bef6bc5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -382,7 +382,7 @@ public final class GenotypeBuilder { else if ( filters.size() == 1 ) return filter(filters.get(0)); else - return filter(ParsingUtils.join(";", filters)); + return filter(ParsingUtils.join(";", ParsingUtils.sortList(filters))); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index f1bebaa30..843838972 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -147,6 +147,10 @@ public class VariantContextUtils { attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? 
alleleFreqs.get(0) : alleleFreqs); + } else { + // if there's no alt AC and AF shouldn't be present + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); } } From 9b87dcda4fbb3b3337cf108d56332347836caba7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 30 Jun 2012 16:23:11 -0400 Subject: [PATCH 19/19] Fixing remaining integration test errors. Adding missing ex2.bcf --- .../sting/gatk/walkers/beagle/BeagleIntegrationTest.java | 2 +- .../diagnostics/targets/DiagnoseTargetsIntegrationTest.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 8fe96b53d..869011a99 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -75,7 +75,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01")); + "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d8906b67c7f9fdb5b37b8e9e050982d3")); spec.disableShadowBCF(); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 
6d37ae2de..55b04f1b8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -44,11 +44,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "c1b89186cdfa48b26865bf9737433e1c"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "644e586e53c6f9bfd8e51e24f78da7b3"); + DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c"); } }