diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index b4cf96831..e3f9b8ee3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -116,6 +116,7 @@ public class VCFDiffableReader implements DiffableReader { if ( g.hasDP() ) gRoot.add("DP", g.getDP() ); if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD())); if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL())); + if ( g.getFilters() != null ) gRoot.add("FT", g.getFilters()); for (Map.Entry attribute : g.getExtendedAttributes().entrySet()) { if ( ! attribute.getKey().startsWith("_") ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 71352bddd..1d85abbcd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -297,7 +297,8 @@ public class VariantFiltrationWalker extends RodWalker { // for each genotype, check filters then create a new object for ( final Genotype g : vc.getGenotypes() ) { if ( g.isCalled() ) { - List filters = new ArrayList(g.getFilters()); + final List filters = new ArrayList(); + if ( g.isFiltered() ) filters.add(g.getFilters()); for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { if ( VariantContextUtils.match(vc, g, exp) ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 96cf5e8c5..230d6c324 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -186,7 +186,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), + if (hsize <= 0) // protect against event lengths larger than ref window sizes + haplotypeMap.clear(); + else + haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3c32d132f..7a5a1ba0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -188,7 +189,7 @@ public class UnifiedGenotyperEngine { else { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); if ( vc != null ) - 
results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model)); + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true)); } } } @@ -309,6 +310,22 @@ public class UnifiedGenotyperEngine { } public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false); + } + + /** + * Main entry function to calculate genotypes of a given VC with corresponding GL's + * @param tracker Tracker + * @param refContext Reference context + * @param rawContext Raw context + * @param stratifiedContexts Stratified alignment contexts + * @param vc Input VC + * @param model GL calculation model + * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc + * @return VC with assigned genotypes + */ + public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; @@ -408,6 +425,9 @@ public class UnifiedGenotyperEngine { // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); + // inherit attributed from input vc if requested + if (inheritAttributesFromInputVC) + attributes.putAll(vc.getAttributes()); // if the site was downsampled, record that fact if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); @@ -474,7 +494,7 @@ public class 
UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed - vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + vcCall = VCFAlleleClipper.reverseTrimAlleles(vcCall); if ( annotationEngine != null && !limitedContext ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations @@ -622,6 +642,9 @@ public class UnifiedGenotyperEngine { final AlignmentContext rawContext) { final List models = new ArrayList(2); + String modelPrefix = ""; + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); // if we're genotyping given alleles and we have a requested SNP at this position, do SNP if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { @@ -631,24 +654,24 @@ public class UnifiedGenotyperEngine { if ( vcInput.isSNP() ) { // ignore SNPs if the user chose INDEL mode only - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) - models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") ) models.add(UAC.GLmodel); } else if ( vcInput.isIndel() || vcInput.isMixed() ) { // ignore INDELs if the user chose SNP mode only - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) - models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + 
models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); else if (UAC.GLmodel.name().toUpperCase().contains("INDEL")) models.add(UAC.GLmodel); } // No support for other types yet } else { - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) { - models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); - models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); } else { models.add(UAC.GLmodel); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 4ccaea932..d6df4ff1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -298,7 +298,7 @@ public class VariantDataManager { attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? 
annotationKeys.get(datum.worstAnnotation) : "NULL")); - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStart(), alleles).attributes(attributes); + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); recalWriter.add(builder.make()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index a3e1c0bd7..21965afcd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -129,7 +129,7 @@ public class LiftoverVariants extends RodWalker { .attribute("OriginalStart", fromInterval.getStart()).make(); } - VariantContext newVC = VariantContextUtils.createVariantContextWithPaddedAlleles(vc); + VariantContext newVC = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 94c24b097..e106e72ac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -51,6 +51,8 @@ import java.util.*; */ public final class BCF2Codec implements FeatureCodec, ReferenceDependentFeatureCodec { final protected 
static Logger logger = Logger.getLogger(BCF2Codec.class); + private final static boolean FORBID_SYMBOLICS = false; + private VCFHeader header = null; /** @@ -270,7 +272,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD " samples in header but have a record with " + nSamples + " samples"); decodeID(builder); - final ArrayList alleles = decodeAlleles(builder, pos, nAlleles); + final List alleles = decodeAlleles(builder, pos, nAlleles); decodeFilter(builder); decodeInfo(builder, nInfo); @@ -283,9 +285,9 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD protected final static class SitesInfoForDecoding { final int nFormatFields; final int nSamples; - final ArrayList alleles; + final List alleles; - private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList alleles) { + private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final List alleles) { this.nFormatFields = nFormatFields; this.nSamples = nSamples; this.alleles = alleles; @@ -325,13 +327,16 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD * @param unclippedAlleles * @return */ - protected static ArrayList clipAllelesIfNecessary(int position, String ref, ArrayList unclippedAlleles) { - if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { - final ArrayList clippedAlleles = new ArrayList(unclippedAlleles.size()); - AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1); - return clippedAlleles; - } else - return unclippedAlleles; + @Requires({"position > 0", "ref != null && ref.length() > 0", "! 
unclippedAlleles.isEmpty()"}) + @Ensures("result.size() == unclippedAlleles.size()") + protected List clipAllelesIfNecessary(final int position, + final String ref, + final List unclippedAlleles) { + // the last argument of 1 allows us to safely ignore the end, because we are + // ultimately going to use the end in the record itself + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles, 1); + if ( clipped.getError() != null ) error(clipped.getError()); + return clipped.getClippedAlleles(); } /** @@ -342,9 +347,9 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD * @return the alleles */ @Requires("nAlleles > 0") - private ArrayList decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { + private List decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes - ArrayList alleles = new ArrayList(nAlleles); + List alleles = new ArrayList(nAlleles); String ref = null; for ( int i = 0; i < nAlleles; i++ ) { @@ -356,7 +361,7 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD alleles.add(allele); - if ( allele.isSymbolic() ) + if ( FORBID_SYMBOLICS && allele.isSymbolic() ) throw new ReviewedStingException("LIMITATION: GATK BCF2 codec does not yet support symbolic alleles"); } assert ref != null; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java index 59537a329..0dadc49f9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2GenotypeFieldDecoders.java @@ -276,9 +276,8 @@ public class BCF2GenotypeFieldDecoders { public void decode(final List 
siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final int numElements, final GenotypeBuilder[] gbs) { for ( final GenotypeBuilder gb : gbs ) { Object value = decoder.decodeTypedValue(typeDescriptor, numElements); - if ( value != null ) { // don't add missing values - gb.filters(value instanceof String ? Collections.singletonList((String)value) : (List)value); - } + assert value == null || value instanceof String; + gb.filter((String)value); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index c749325fb..35fb2e97a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -44,13 +44,13 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec // and its stored here again for code cleanliness private final BCF2Codec codec; - private final ArrayList siteAlleles; + private final List siteAlleles; private final int nSamples; private final int nFields; private final GenotypeBuilder[] builders; @Requires("codec.getHeader().getNGenotypeSamples() == builders.length") - BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, + BCF2LazyGenotypesDecoder(final BCF2Codec codec, final List alleles, final int nSamples, final int nFields, final GenotypeBuilder[] builders) { this.codec = codec; this.siteAlleles = alleles; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index f9f310538..1d5869349 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -186,81 +186,19 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - - // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line - if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; - - // our header cannot be null, we need the genotype sample names and counts - if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - - final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - - if ( nParts != 6 ) - throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - // get our alleles (because the end position depends on them) - final String ref = getCachedString(locParts[3].toUpperCase()); - final String alts = getCachedString(locParts[4].toUpperCase()); - final List alleles = parseAlleles(ref, alts, lineNo); - - // find out our location - int start = 0; - try { - start = Integer.valueOf(locParts[1]); - } catch (Exception e) { - generateException("the value in the POS field must be an integer but it was " + locParts[1], lineNo); - } - int stop = start; - - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = start + alleles.get(0).length() - 1; - } - // we need to parse the INFO field to check for an END tag if it's a symbolic allele - else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() ) { - final String[] extraParts = new String[4]; - final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); - if ( nExtraParts < 3 ) - throw new 
UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); - - final Map attrs = parseInfo(extraParts[2]); - try { - stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start; - } catch (Exception e) { - throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo); - } - } - // handle multi-positional events - else if ( !isSingleNucleotideEvent(alleles) ) { - stop = clipAlleles(start, ref, alleles, null, lineNo); - } - - return new VCFLocFeature(locParts[0], start, stop); + return decodeLine(line, false); } - private final static class VCFLocFeature implements Feature { - - final String chr; - final int start, stop; - - private VCFLocFeature(String chr, int start, int stop) { - this.chr = chr; - this.start = start; - this.stop = stop; - } - - public String getChr() { return chr; } - public int getStart() { return start; } - public int getEnd() { return stop; } - } - - /** * decode the line into a feature (VariantContext) * @param line the line * @return a VariantContext */ public VariantContext decode(String line) { + return decodeLine(line, true); + } + + private final VariantContext decodeLine(final String line, final boolean includeGenotypes) { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; @@ -278,15 +216,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? 
NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + " tokens, and saw " + nParts + " )", lineNo); - return parseVCFLine(parts); - } - - protected void generateException(String message) { - throw new UserException.MalformedVCF(message, lineNo); - } - - protected static void generateException(String message, int lineNo) { - throw new UserException.MalformedVCF(message, lineNo); + return parseVCFLine(parts, includeGenotypes); } /** @@ -295,7 +225,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec * @param parts the parts split up * @return a variant context object */ - private VariantContext parseVCFLine(String[] parts) { + private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { VariantContextBuilder builder = new VariantContextBuilder(); builder.source(getName()); @@ -317,8 +247,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec else builder.id(parts[2]); - String ref = getCachedString(parts[3].toUpperCase()); - String alts = getCachedString(parts[4].toUpperCase()); + final String ref = getCachedString(parts[3].toUpperCase()); + final String alts = getCachedString(parts[4].toUpperCase()); builder.log10PError(parseQual(parts[5])); final List filters = parseFilters(getCachedString(parts[6])); @@ -327,31 +257,11 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec builder.attributes(attrs); // get our alleles, filters, and setup an attribute map - List alleles = parseAlleles(ref, alts, lineNo); - - // find out our current location, and clip the alleles down to their minimum length - int stop = pos; - // ref alleles don't need to be single bases for monomorphic sites - if ( alleles.size() == 1 ) { - stop = pos + alleles.get(0).length() - 1; - } - // we need to parse the INFO field to check for an END tag if it's a symbolic allele - else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) { - try { - stop = 
Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); - } catch (Exception e) { - generateException("the END value in the INFO field is not valid"); - } - } else if ( !isSingleNucleotideEvent(alleles) ) { - ArrayList newAlleles = new ArrayList(); - stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo); - alleles = newAlleles; - } - builder.stop(stop); - builder.alleles(alleles); + final List rawAlleles = parseAlleles(ref, alts, lineNo); + final List alleles = updateBuilderAllelesAndStop(builder, ref, pos, rawAlleles, attrs); // do we have genotyping data - if (parts.length > NUM_STANDARD_FIELDS) { + if (parts.length > NUM_STANDARD_FIELDS && includeGenotypes) { final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); final int nGenotypes = header.getNGenotypeSamples(); LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); @@ -374,6 +284,31 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec return vc; } + private final List updateBuilderAllelesAndStop(final VariantContextBuilder builder, + final String ref, + final int pos, + final List rawAlleles, + final Map attrs) { + int endForSymbolicAlleles = pos; // by default we use the pos + if ( attrs.containsKey(VCFConstants.END_KEY) ) { + // update stop with the end key if provided + try { + endForSymbolicAlleles = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); + } catch (Exception e) { + generateException("the END value in the INFO field is not valid"); + } + } + + // find out our current location, and clip the alleles down to their minimum length + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, rawAlleles, endForSymbolicAlleles); + if ( clipped.getError() != null ) + generateException(clipped.getError(), lineNo); + + builder.stop(clipped.getStop()); + builder.alleles(clipped.getClippedAlleles()); + return clipped.getClippedAlleles(); + } + /** * get the name of this 
codec * @return our set name @@ -613,102 +548,6 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec alleles.add(allele); } - public static boolean isSingleNucleotideEvent(List alleles) { - for ( Allele a : alleles ) { - if ( a.length() != 1 ) - return false; - } - return true; - } - - public static int computeForwardClipping(final List unclippedAlleles, final byte ref0) { - boolean clipping = true; - int symbolicAlleleCount = 0; - - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - symbolicAlleleCount++; - continue; - } - - if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { - clipping = false; - break; - } - } - - // don't clip if all alleles are symbolic - return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0; - } - - public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong - // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). - if ( a.length() - clipping == 0 ) - return clipping - (allowFullClip ? 
0 : 1); - - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { - stillClipping = false; - } - else if ( ref.length == clipping ) { - if ( allowFullClip ) - stillClipping = false; - else - generateException("bad alleles encountered", lineNo); - } - else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { - stillClipping = false; - } - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - - /** - * clip the alleles, based on the reference - * - * @param position the unadjusted start position (pre-clipping) - * @param ref the reference string - * @param unclippedAlleles the list of unclipped alleles - * @param clippedAlleles output list of clipped alleles - * @param lineNo the current line number in the file - * @return the new reference end position of this event - */ - public static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - - int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo); - - if ( clippedAlleles != null ) { - for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) { - clippedAlleles.add(a); - } else { - final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping); - if ( !Allele.acceptableAlleleBases(allele) ) - generateException("Unparsable vcf record with bad allele [" + allele + "]", lineNo); - clippedAlleles.add(Allele.create(allele, a.isReference())); - } - } - } - - // the new reference length - int refLength = ref.length() - reverseClipping; - - return position+Math.max(refLength - 1,0); - } - public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) { try { return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || @@ -857,4 +696,13 @@ public abstract class 
AbstractVCFCodec extends AsciiFeatureCodec public final void disableOnTheFlyModifications() { doOnTheFlyModifications = false; } + + + protected void generateException(String message) { + throw new UserException.MalformedVCF(message, lineNo); + } + + protected static void generateException(String message, int lineNo) { + throw new UserException.MalformedVCF(message, lineNo); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java new file mode 100644 index 000000000..40ba23d9d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipper.java @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.codecs.vcf; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +/** + * All of the gross allele clipping and padding routines in one place + * + * Having attempted to understand / fix / document this code myself + * I can only conclude that this entire approach needs to be rethought. This + * code just doesn't work robustly with symbolic alleles, with multiple alleles, + * requires a special "reference base for indels" stored in the VariantContext + * whose correctness isn't enforced, and overall has strange special cases + * all over the place. + * + * The reason this code is so complex is due to symbolics and multi-alleleic + * variation, which frequently occur when combining variants from multiple + * VCF files. + * + * TODO rethink this class, make it clean, and make it easy to create, mix, and write out alleles + * TODO this code doesn't work with reverse clipped alleles (ATA / GTTA -> AT / GT) + * + * @author Mark DePristo + * @since 6/12 + */ +public final class VCFAlleleClipper { + private VCFAlleleClipper() { } + + /** + * Determine whether we should clip off the first base of all unclippped alleles or not + * + * Returns true if all of the alleles in unclippedAlleles share a common first base with + * ref0. Ref0 should be the first base of the reference allele UnclippedAlleles may + * contain the reference allele itself, or just the alternate alleles, it doesn't matter. + * + * The algorithm returns true if the first base should be clipped off, or false otherwise + * + * This algorithm works even in the presence of symbolic alleles, logically ignoring these + * values. 
It + * + * @param unclippedAlleles list of unclipped alleles to assay + * @param ref0 the first base of the reference allele + * @return true if we should clip the first base of unclippedAlleles + */ + @Requires("unclippedAlleles != null") + public static boolean shouldClipFirstBaseP(final List unclippedAlleles, + final byte ref0) { + boolean allSymbolicAlt = true; + + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + continue; + } + + // already know we aren't symbolic, so we only need to decide if we have only seen a ref + if ( ! a.isReference() ) + allSymbolicAlt = false; + + if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { + return false; + } + } + + // to reach here all alleles are consistent with clipping the first base matching ref0 + // but we don't clip if all ALT alleles are symbolic + return ! allSymbolicAlt; + } + + public static int computeReverseClipping(final List unclippedAlleles, + final byte[] ref, + final int forwardClipping, + final boolean allowFullClip) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - (allowFullClip ? 
0 : 1); + + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { + stillClipping = false; + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + return -1; + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { + stillClipping = false; + } + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + + /** + * Are the alleles describing a polymorphism substitution one base for another? + * + * @param alleles a list of alleles, must not be empty + * @return Return true if the length of any allele in alleles isn't 1 + */ + @Requires("!alleles.isEmpty()") + private static boolean isSingleNucleotideEvent(final List alleles) { + for ( final Allele a : alleles ) { + if ( a.length() != 1 ) + return false; + } + return true; + } + + /** + * clip the alleles, based on the reference, returning a ClippedAlleles object describing what happened + * + * The ClippedAlleles object contains the implied stop position of the alleles, given the provided start + * position, after clipping. It also contains the list of alleles, in the same order as the provided + * unclipped ones, that are the fully clipped version of the input alleles. If an error occurs + * during this option the getError() function returns a string describing the problem (for use in parsers). + * + * The basic operation are: + * + * single allele + * => stop == start and clipped == unclipped + * any number of single nucleotide events + * => stop == start and clipped == unclipped + * two alleles, second being symbolic + * => stop == start and clipped == unclipped + * Note in this case that the STOP should be computed by other means (from END in VCF, for example) + * Note that if there's more than two alleles and the second is a symbolic the code produces an error + * Any other case: + * The alleles are trimmed of any sequence shared at the end of the alleles. 
If N bases + * are common then the alleles will all be at least N bases shorter. + * The stop position returned is the start position + the length of the + * reverse trimmed only reference allele - 1. + * If the alleles all share a single common starting sequence (just one base is considered) + * then the alleles have this leading common base removed as well. + * + * TODO This code is gross and brittle and needs to be rethought from scratch + * + * @param start the unadjusted start position (pre-clipping) + * @param ref the reference string + * @param unclippedAlleles the list of unclipped alleles, including the reference allele + * @return the new reference end position of this event + */ + @Requires({"start > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"}) + @Ensures("result != null") + public static ClippedAlleles clipAlleles(final int start, + final String ref, + final List unclippedAlleles, + final int endForSymbolicAllele ) { + // no variation or single nucleotide events are by definition fully clipped + if ( unclippedAlleles.size() == 1 || isSingleNucleotideEvent(unclippedAlleles) ) + return new ClippedAlleles(start, unclippedAlleles, null); + + // we've got to sort out the clipping by looking at the alleles themselves + final byte firstRefBase = (byte) ref.charAt(0); + final boolean firstBaseIsClipped = shouldClipFirstBaseP(unclippedAlleles, firstRefBase); + final int forwardClipping = firstBaseIsClipped ? 1 : 0; + final int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false); + final boolean needsClipping = forwardClipping > 0 || reverseClipping > 0; + + if ( reverseClipping == -1 ) + return new ClippedAlleles("computeReverseClipping failed due to bad alleles"); + + boolean sawSymbolic = false; + List clippedAlleles; + if ( ! 
needsClipping ) { + // there's nothing to clip, so clippedAlleles are the original alleles + clippedAlleles = unclippedAlleles; + } else { + clippedAlleles = new ArrayList(unclippedAlleles.size()); + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + sawSymbolic = true; + clippedAlleles.add(a); + } else { + final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping); + if ( !Allele.acceptableAlleleBases(allele) ) + return new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]"); + clippedAlleles.add(Allele.create(allele, a.isReference())); + } + } + } + + int stop = VariantContextUtils.computeEndFromAlleles(clippedAlleles, start, endForSymbolicAllele); + + // TODO + // TODO + // TODO COMPLETELY BROKEN CODE -- THE GATK CURRENTLY ENCODES THE STOP POSITION FOR CLIPPED ALLELES AS + 1 + // TODO ITS TRUE SIZE TO DIFFERENTIATE CLIPPED VS. UNCLIPPED ALLELES. NEEDS TO BE FIXED + // TODO + // TODO + if ( needsClipping && ! sawSymbolic && ! clippedAlleles.get(0).isNull() ) stop++; + // TODO + // TODO + // TODO COMPLETELY BROKEN CODE -- THE GATK CURRENTLY ENCODES THE STOP POSITION FOR CLIPPED ALLELES AS + 1 + // TODO ITS TRUE SIZE TO DIFFERENTIATE CLIPPED VS. UNCLIPPED ALLELES. NEEDS TO BE FIXED + // TODO + // TODO + + final Byte refBaseForIndel = firstBaseIsClipped ? firstRefBase : null; + return new ClippedAlleles(stop, clippedAlleles, refBaseForIndel); + } + + /** + * Returns true if the alleles in inputVC should have reference bases added for padding + * + * We need to pad a VC with a common base if the length of the reference allele is + * less than the length of the VariantContext. This happens because the position of + * e.g. an indel is always one before the actual event (as per VCF convention). 
+ * + * @param inputVC the VC to evaluate, cannot be null + * @return true if + */ + public static boolean needsPadding(final VariantContext inputVC) { + // biallelic sites with only symbolic never need padding + if ( inputVC.isBiallelic() && inputVC.getAlternateAllele(0).isSymbolic() ) + return false; + + final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1; + final int referenceLength = inputVC.getReference().length(); + + if ( referenceLength == recordLength ) + return false; + else if ( referenceLength == recordLength - 1 ) + return true; + else if ( !inputVC.hasSymbolicAlleles() ) + throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + + " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); + else if ( inputVC.isMixed() && inputVC.hasSymbolicAlleles() ) + throw new IllegalArgumentException("GATK infrastructure limitation prevents needsPadding from working properly with VariantContexts containing a mixture of symbolic and concrete alleles at " + inputVC); + return false; + } + + public static Allele padAllele(final VariantContext vc, final Allele allele) { + assert needsPadding(vc); + + if ( allele.isSymbolic() ) + return allele; + else { + // get bases for current allele and create a new one with trimmed bases + final StringBuilder sb = new StringBuilder(); + sb.append((char)vc.getReferenceBaseForIndel().byteValue()); + sb.append(allele.getDisplayString()); + final String newBases = sb.toString(); + return Allele.create(newBases, allele.isReference()); + } + } + + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC) { + final boolean padVC = needsPadding(inputVC); + + // nothing to do if we don't need to pad bases + if ( padVC ) { + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + 
inputVC.getStart() + "; no padded reference base is available."); + + final ArrayList alleles = new ArrayList(inputVC.getNAlleles()); + final Map unpaddedToPadded = inputVC.hasGenotypes() ? new HashMap(inputVC.getNAlleles()) : null; + + boolean paddedAtLeastOne = false; + for (final Allele a : inputVC.getAlleles()) { + final Allele padded = padAllele(inputVC, a); + paddedAtLeastOne = paddedAtLeastOne || padded != a; + alleles.add(padded); + if ( unpaddedToPadded != null ) unpaddedToPadded.put(a, padded); // conditional to avoid making unnecessary make + } + + if ( ! paddedAtLeastOne ) + throw new ReviewedStingException("VC was supposed to need padding but no allele was actually changed at location " + inputVC.getChr() + ":" + inputVC.getStart() + " with allele " + inputVC.getAlleles()); + + final VariantContextBuilder vcb = new VariantContextBuilder(inputVC); + vcb.alleles(alleles); + + // the position of the inputVC is one further, if it doesn't contain symbolic alleles + vcb.computeEndFromAlleles(alleles, inputVC.getStart(), inputVC.getEnd()); + + if ( inputVC.hasGenotypes() ) { + assert unpaddedToPadded != null; + + // now we can recreate new genotypes with trimmed alleles + final GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); + for (final Genotype g : inputVC.getGenotypes() ) { + final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); + for (final Allele a : g.getAlleles()) { + newGenotypeAlleles.add( a.isCalled() ? 
unpaddedToPadded.get(a) : Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); + } + vcb.genotypes(genotypes); + } + + return vcb.make(); + } + else + return inputVC; + + } + + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + // see if we need to trim common reference base from all alleles + + final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true); + if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) + return inputVC; + + final List alleles = new ArrayList(); + final GenotypesContext genotypes = GenotypesContext.create(); + final Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + final List originalAlleles = genotype.getAlleles(); + final List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); + } + + return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? 
-1 : 0)).alleles(alleles).genotypes(genotypes).make(); + } + + @Invariant("stop != -1 || error != null") // we're either an error or a meaningful result but not both + public static class ClippedAlleles { + private final int stop; + private final List clippedAlleles; + private final Byte refBaseForIndel; + private final String error; + + @Requires({"stop > 0", "clippedAlleles != null"}) + private ClippedAlleles(final int stop, final List clippedAlleles, final Byte refBaseForIndel) { + this.stop = stop; + this.clippedAlleles = clippedAlleles; + this.error = null; + this.refBaseForIndel = refBaseForIndel; + } + + @Requires("error != null") + private ClippedAlleles(final String error) { + this.stop = -1; + this.clippedAlleles = null; + this.refBaseForIndel = null; + this.error = error; + } + + /** + * Get an error if it occurred + * @return the error message, or null if no error occurred + */ + public String getError() { + return error; + } + + /** + * Get the stop position to use after the clipping as been applied, given the + * provided position to clipAlleles + * @return + */ + public int getStop() { + return stop; + } + + /** + * Get the clipped alleles themselves + * @return the clipped alleles in the order of the input unclipped alleles + */ + public List getClippedAlleles() { + return clippedAlleles; + } + + /** + * Returns the reference base we should use for indels, or null if none is appropriate + * @return + */ + public Byte getRefBaseForIndel() { + return refBaseForIndel; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java index dcc141b00..40d22f46f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFStandardHeaderLines.java @@ -183,7 +183,7 @@ public class VCFStandardHeaderLines { 
registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); // INFO lines registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 93eb1b237..dc7bcd926 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; @@ -32,6 +34,7 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.variantcontext.Allele; import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java index e1185ba70..d528bf0e4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java @@ -111,8 +111,9 @@ public final class FastGenotype extends Genotype { final int DP, final int[] AD, final int[] PL, + final String filters, final Map extendedAttributes) { - super(sampleName); + super(sampleName, filters); this.alleles = alleles; this.isPhased = isPhased; this.GQ = GQ; @@ -152,10 +153,6 @@ public final class FastGenotype extends Genotype { return GQ; } - @Override public List getFilters() { - return (List) getExtendedAttribute(VCFConstants.GENOTYPE_FILTER_KEY, Collections.emptyList()); - } - @Override public int[] getPL() { return PL; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index d268aabc6..fae0a7c4c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -27,6 +27,7 @@ public abstract class Genotype implements Comparable { * extended attributes map */ public final static Collection PRIMARY_KEYS = Arrays.asList( + VCFConstants.GENOTYPE_FILTER_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, @@ -38,14 +39,11 @@ public abstract class Genotype implements Comparable { private final String sampleName; private GenotypeType type = null; + private final String filters; - protected Genotype(final String sampleName) { + protected Genotype(final String sampleName, final String filters) { this.sampleName = sampleName; - 
} - - protected Genotype(final String sampleName, final GenotypeType type) { - this.sampleName = sampleName; - this.type = type; + this.filters = filters; } /** @@ -348,13 +346,14 @@ public abstract class Genotype implements Comparable { } public String toString() { - return String.format("[%s %s%s%s%s%s%s]", + return String.format("[%s %s%s%s%s%s%s%s]", getSampleName(), getGenotypeString(false), toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, getGQ()), toStringIfExists(VCFConstants.DEPTH_KEY, getDP()), toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, getAD()), toStringIfExists(VCFConstants.GENOTYPE_PL_KEY, getPL()), + toStringIfExists(VCFConstants.GENOTYPE_FILTER_KEY, getFilters()), sortedString(getExtendedAttributes())); } @@ -448,15 +447,25 @@ public abstract class Genotype implements Comparable { } /** + * Returns the filter string associated with this Genotype. * - * @return + * @return If this result == null, then the genotype is considered PASSing filters + * If the result != null, then the genotype has failed filtering for the reason(s) + * specified in result. To be reference compliant multiple filter field + * string values can be encoded with a ; separator. */ - @Ensures({"result != null"}) - public abstract List getFilters(); + public final String getFilters() { + return filters; + } - @Ensures({"result != getFilters().isEmpty()"}) - public boolean isFiltered() { - return ! getFilters().isEmpty(); + /** + * Is this genotype filtered or not? + * + * @return returns false if getFilters() == null + */ + @Ensures({"result != (getFilters() == null)"}) + public final boolean isFiltered() { + return getFilters() != null; } @Deprecated public boolean hasLog10PError() { return hasGQ(); } @@ -569,6 +578,16 @@ public abstract class Genotype implements Comparable { return v == -1 ? "" : " " + name + " " + v; } + /** + * Returns a display name for field name with String value v if this isn't null. 
Otherwise returns "" + * @param name of the field ("FT") + * @param v the value of the field, or null if missing + * @return a non-null string for display if the field is not missing + */ + protected final static String toStringIfExists(final String name, final String v) { + return v == null ? "" : " " + name + " " + v; + } + /** * Returns a display name for field name with values vs if this isn't null. Otherwise returns "" * @param name of the field ("AD") diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index ac0c503f7..e3bef6bc5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; +import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import java.util.*; @@ -63,6 +64,7 @@ public final class GenotypeBuilder { private int[] AD = null; private int[] PL = null; private Map extendedAttributes = null; + private String filters = null; private int initialAttributeMapSize = 5; private boolean useFast = MAKE_FAST_BY_DEFAULT; @@ -147,6 +149,7 @@ public final class GenotypeBuilder { DP(g.getDP()); AD(g.getAD()); PL(g.getPL()); + filter(g.getFilters()); attributes(g.getExtendedAttributes()); return this; } @@ -164,6 +167,7 @@ public final class GenotypeBuilder { DP = -1; AD = null; PL = null; + filters = null; extendedAttributes = null; } @@ -180,21 +184,11 @@ public final class GenotypeBuilder { public Genotype make() { if ( useFast ) { final Map ea = extendedAttributes == null ? 
NO_ATTRIBUTES : extendedAttributes; - return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, ea); + return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, filters, ea); } else { final Map attributes = new LinkedHashMap(); if ( extendedAttributes != null ) attributes.putAll(extendedAttributes); final double log10PError = GQ == -1 ? SlowGenotype.NO_LOG10_PERROR : (GQ == 0 ? 0 : GQ / -10.0); - - Set filters = null; - if ( extendedAttributes != null && extendedAttributes.containsKey(VCFConstants.GENOTYPE_FILTER_KEY) ) - { - final Object f = extendedAttributes.get(VCFConstants.GENOTYPE_FILTER_KEY); - if ( f != null ) - filters = new LinkedHashSet((List)f); - attributes.remove(VCFConstants.GENOTYPE_FILTER_KEY); - } - if ( DP != -1 ) attributes.put(VCFConstants.DEPTH_KEY, DP); if ( AD != null ) attributes.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, AD); final double[] log10likelihoods = PL != null ? GenotypeLikelihoods.fromPLs(PL).getAsVector() : null; @@ -383,9 +377,12 @@ public final class GenotypeBuilder { */ @Requires("filters != null") public GenotypeBuilder filters(final List filters) { - if ( ! filters.isEmpty() ) - attribute(VCFConstants.GENOTYPE_FILTER_KEY, filters); - return this; + if ( filters.isEmpty() ) + return filter(null); + else if ( filters.size() == 1 ) + return filter(filters.get(0)); + else + return filter(ParsingUtils.join(";", ParsingUtils.sortList(filters))); } /** @@ -398,15 +395,24 @@ public final class GenotypeBuilder { return filters(Arrays.asList(filters)); } + /** + * Most efficient version of setting filters -- just set the filters string to filters + * + * @param filter if filters == null or filters.equals("PASS") => genotype is PASS + * @return + */ + public GenotypeBuilder filter(final String filter) { + this.filters = VCFConstants.PASSES_FILTERS_v4.equals(filter) ? 
null : filter; + return this; + } + /** * This genotype is unfiltered * * @return */ public GenotypeBuilder unfiltered() { - if ( extendedAttributes != null ) - extendedAttributes.remove(VCFConstants.GENOTYPE_FILTER_KEY); - return this; + return filter(null); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java index 0dda82050..c3f027484 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/SlowGenotype.java @@ -42,14 +42,20 @@ public class SlowGenotype extends Genotype { protected List alleles = null; protected boolean isPhased = false; - protected SlowGenotype(String sampleName, List alleles, double log10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { - super(sampleName); + protected SlowGenotype(final String sampleName, + final List alleles, + final double log10PError, + final String filters, + final Map attributes, + final boolean isPhased, + final double[] log10Likelihoods) { + super(sampleName, filters); if ( alleles == null || alleles.isEmpty() ) this.alleles = Collections.emptyList(); else this.alleles = Collections.unmodifiableList(alleles); - commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes); + commonInfo = new CommonInfo(sampleName, log10PError, Collections.emptySet(), attributes); if ( log10Likelihoods != null ) commonInfo.putAttribute(VCFConstants.GENOTYPE_PL_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); this.isPhased = isPhased; @@ -112,7 +118,6 @@ public class SlowGenotype extends Genotype { // get routines to access context info fields // // --------------------------------------------------------------------------------------------------------- - @Override public List getFilters() { return new ArrayList(commonInfo.getFilters()); } 
@Override public boolean hasLog10PError() { return commonInfo.hasLog10PError(); } @Override public double getLog10PError() { return commonInfo.getLog10PError(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index dc600d97c..1f0b2b054 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -558,8 +558,8 @@ public class VariantContext implements Feature { // to enable tribble integratio } public String getAlleleStringWithRefPadding(final Allele allele) { - if ( VariantContextUtils.needsPadding(this) ) - return VariantContextUtils.padAllele(this, allele).getDisplayString(); + if ( VCFAlleleClipper.needsPadding(this) ) + return VCFAlleleClipper.padAllele(this, allele).getDisplayString(); else return allele.getDisplayString(); } @@ -1126,6 +1126,7 @@ public class VariantContext implements Feature { // to enable tribble integratio // --------------------------------------------------------------------------------------------------------- private boolean validate(final EnumSet validationToPerform) { + validateStop(); for (final Validation val : validationToPerform ) { switch (val) { case ALLELES: validateAlleles(); break; @@ -1138,6 +1139,20 @@ public class VariantContext implements Feature { // to enable tribble integratio return true; } + /** + * Check that getEnd() == END from the info field, if it's present + */ + private void validateStop() { + if ( hasAttribute(VCFConstants.END_KEY) ) { + final int end = getAttributeAsInt(VCFConstants.END_KEY, -1); + assert end != -1; + if ( end != getEnd() ) + throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + + getStart() + "; getEnd() was " + getEnd() + + " but this VariantContext contains an END key with value " + end); + } 
+ } + private void validateReferencePadding() { if ( hasSymbolicAlleles() ) // symbolic alleles don't need padding... return; @@ -1176,7 +1191,7 @@ public class VariantContext implements Feature { // to enable tribble integratio // if ( getType() == Type.INDEL ) { // if ( getReference().length() != (getLocation().size()-1) ) { long length = (stop - start) + 1; - if ( ! isSymbolic() + if ( ! hasSymbolicAlleles() && ((getReference().isNull() && length != 1 ) || (getReference().isNonNull() && (length - getReference().length() > 1)))) { throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); @@ -1477,7 +1492,11 @@ public class VariantContext implements Feature { // to enable tribble integratio } public boolean hasSymbolicAlleles() { - for (final Allele a: getAlleles()) { + return hasSymbolicAlleles(getAlleles()); + } + + public static boolean hasSymbolicAlleles( final List alleles ) { + for ( final Allele a: alleles ) { if (a.isSymbolic()) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 01d3ab456..0590329c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -454,6 +454,32 @@ public class VariantContextBuilder { return this; } + /** + * @see #computeEndFromAlleles(java.util.List, int, int) with endForSymbolicAlleles == -1 + */ + public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start) { + return computeEndFromAlleles(alleles, start, -1); + } + + /** + * Compute the end position for this VariantContext from the alleles themselves + * + * @see 
VariantContextUtils.computeEndFromAlleles() + * + * assigns this builder the stop position computed. + * + * @param alleles the list of alleles to consider. The reference allele must be the first one + * @param start the known start position of this event + * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 + * if no is expected but will throw an error if one is found + * @return this builder + */ + @Requires({"! alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) + public VariantContextBuilder computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { + stop(VariantContextUtils.computeEndFromAlleles(alleles, start, endForSymbolicAlleles)); + return this; + } + /** * @return true if this builder contains fully decoded data * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index ccc0f5971..843838972 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -46,7 +46,9 @@ public class VariantContextUtils { public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); final public static JexlEngine engine = new JexlEngine(); public static final int DEFAULT_PLOIDY = 2; @@ -145,6 +147,10 @@ public class VariantContextUtils { attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? 
alleleCounts.get(0) : alleleCounts); attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); + } else { + // if there's no alt AC and AF shouldn't be present + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); } } @@ -184,83 +190,6 @@ public class VariantContextUtils { return g; } - /** - * Returns true if the alleles in inputVC should have reference bases added for padding - * - * We need to pad a VC with a common base if the length of the reference allele is - * less than the length of the VariantContext. This happens because the position of - * e.g. an indel is always one before the actual event (as per VCF convention). - * - * @param inputVC the VC to evaluate, cannot be null - * @return true if - */ - public static boolean needsPadding(final VariantContext inputVC) { - final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1; - final int referenceLength = inputVC.getReference().length(); - - if ( referenceLength == recordLength ) - return false; - else if ( referenceLength == recordLength - 1 ) - return true; - else if ( !inputVC.hasSymbolicAlleles() ) - throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + - " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); - else - return false; - } - - public static Allele padAllele(final VariantContext vc, final Allele allele) { - assert needsPadding(vc); - - if ( allele.isSymbolic() ) - return allele; - else { - // get bases for current allele and create a new one with trimmed bases - final StringBuilder sb = new StringBuilder(); - sb.append((char)vc.getReferenceBaseForIndel().byteValue()); - sb.append(allele.getDisplayString()); - final String newBases = sb.toString(); - return Allele.create(newBases, allele.isReference()); - } - } - - - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC) { - final boolean padVC = needsPadding(inputVC); - - // nothing to do if we don't need to pad bases - if ( padVC ) { - if ( !inputVC.hasReferenceBaseForIndel() ) - throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - - final ArrayList alleles = new ArrayList(inputVC.getNAlleles()); - final Map unpaddedToPadded = new HashMap(inputVC.getNAlleles()); - - for (final Allele a : inputVC.getAlleles()) { - final Allele padded = padAllele(inputVC, a); - alleles.add(padded); - unpaddedToPadded.put(a, padded); - } - - // now we can recreate new genotypes with trimmed alleles - GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples()); - for (final Genotype g : inputVC.getGenotypes() ) { - final List newGenotypeAlleles = new ArrayList(g.getAlleles().size()); - for (final Allele a : g.getAlleles()) { - newGenotypeAlleles.add( a.isCalled() ? 
unpaddedToPadded.get(a) : Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make()); - - } - - return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make(); - } - else - return inputVC; - - } - - private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field); if ( metaData == null ) metaData = header.getInfoHeaderLine(field); @@ -564,7 +493,7 @@ public class VariantContextUtils { for (final VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(createVariantContextWithPaddedAlleles(vc)); + VCs.add(VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc)); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -769,7 +698,7 @@ public class VariantContextUtils { return true; } - public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { + private static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; @@ -780,7 +709,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0); + trimVC = VCFAlleleClipper.shouldClipFirstBaseP(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)); } // nothing to do if we don't need to trim bases @@ -836,46 +765,6 @@ public class VariantContextUtils { return inputVC; } - public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - // see if we need to trim common reference 
base from all alleles - - final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1); - if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) - return inputVC; - - final List alleles = new ArrayList(); - final GenotypesContext genotypes = GenotypesContext.create(); - final Map originalToTrimmedAlleleMap = new HashMap(); - - for (final Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); - final Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // now we can recreate new genotypes with trimmed alleles - for ( final Genotype genotype : inputVC.getGenotypes() ) { - final List originalAlleles = genotype.getAlleles(); - final List trimmedAlleles = new ArrayList(); - for ( final Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make()); - } - - return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? 
-1 : 0)).alleles(alleles).genotypes(genotypes).make(); - } - public static GenotypesContext stripPLs(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); @@ -1485,4 +1374,33 @@ public class VariantContextUtils { return true; // we passed all tests, we matched } + + /** + * Compute the end position for this VariantContext from the alleles themselves + * + * In the trivial case this is a single BP event and end = start (open intervals) + * In general the end is start + ref length - 1, handling the case where ref length == 0 + * However, if alleles contains a symbolic allele then we use endForSymbolicAlleles in all cases + * + * @param alleles the list of alleles to consider. The reference allele must be the first one + * @param start the known start position of this event + * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 + * if none is expected but will throw an error if one is found + * @return the computed end position for a variant context with these alleles + */ + @Requires({"! 
alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) + public static int computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { + final Allele ref = alleles.get(0); + + if ( ref.isNonReference() ) + throw new ReviewedStingException("computeEndFromAlleles requires first allele to be reference"); + + if ( VariantContext.hasSymbolicAlleles(alleles) ) { + if ( endForSymbolicAlleles == -1 ) + throw new ReviewedStingException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was not provided"); + return endForSymbolicAlleles; + } else { + return start + Math.max(ref.length() - 1, 0); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 42567adb8..5b81e7117 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; @@ -49,17 +50,22 @@ public abstract class BCF2FieldWriter { private final VCFHeader header; private final BCF2FieldEncoder fieldEncoder; + @Requires({"header != null", "fieldEncoder != null"}) protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { this.header = header; this.fieldEncoder = fieldEncoder; } + @Ensures("result != null") protected VCFHeader getHeader() { return header; } + @Ensures("result != null") protected BCF2FieldEncoder getFieldEncoder() { return fieldEncoder; } + @Ensures("result != null") protected String getField() { return 
getFieldEncoder().getField(); } + @Requires("vc != null") public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { fieldEncoder.writeFieldKey(encoder); } @@ -124,6 +130,9 @@ public abstract class BCF2FieldWriter { } @Override + @Requires({"encodingType != null", + "nValuesPerGenotype >= 0 || ! getFieldEncoder().hasConstantNumElements()"}) + @Ensures("nValuesPerGenotype >= 0") public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException { // writes the key information super.start(encoder, vc); @@ -141,15 +150,18 @@ public abstract class BCF2FieldWriter { encoder.encodeType(nValuesPerGenotype, encodingType); } + @Requires({"encodingType != null", "nValuesPerGenotype >= 0"}) public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { final Object fieldValue = g.getExtendedAttribute(getField(), null); getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); } + @Ensures({"result >= 0"}) protected int numElements(final VariantContext vc, final Genotype g) { return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField())); } + @Ensures({"result >= 0"}) private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) { int size = -1; @@ -227,6 +239,22 @@ public abstract class BCF2FieldWriter { } } + public static class FTGenotypesWriter extends StaticallyTypeGenotypesWriter { + public FTGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) { + super(header, fieldEncoder); + } + + public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException { + final String fieldValue = g.getFilters(); + getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype); + } + + @Override + protected int numElements(final VariantContext vc, final Genotype g) { + return getFieldEncoder().numElements(vc, g.getFilters()); + } + 
} + public static class GTWriter extends GenotypesWriter { final Map alleleMapForTriPlus = new HashMap(5); Allele ref, alt1; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 9f5bbc487..269750c66 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -140,6 +140,8 @@ public class BCF2FieldWriterManager { if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { return new BCF2FieldWriter.GTWriter(header, fieldEncoder); + } else if ( line.getID().equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { + return new BCF2FieldWriter.FTGenotypesWriter(header, fieldEncoder); } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) { return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field)); } else if ( line.getType() == VCFHeaderLineType.Integer ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index 5555849dd..e281475f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -261,11 +261,13 @@ class BCF2Writer extends IndexingVariantContextWriter { } private void buildAlleles( VariantContext vc ) throws IOException { - final boolean needsPadding = VariantContextUtils.needsPadding(vc); + final boolean needsPadding = VCFAlleleClipper.needsPadding(vc); for ( Allele allele : vc.getAlleles() ) { if ( needsPadding ) - allele = VariantContextUtils.padAllele(vc,allele); + allele = VCFAlleleClipper.padAllele(vc, allele); final byte[] s = 
allele.getDisplayBases(); + if ( s == null ) + throw new ReviewedStingException("BUG: BCF2Writer encountered null padded allele" + allele); encoder.encodeTypedString(s); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index ee7b1b9ef..2a4c8cac2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -162,7 +162,7 @@ class VCFWriter extends IndexingVariantContextWriter { vc = new VariantContextBuilder(vc).noGenotypes().make(); try { - vc = VariantContextUtils.createVariantContextWithPaddedAlleles(vc); + vc = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); super.add(vc); Map alleleMap = buildAlleleMap(vc); @@ -348,9 +348,8 @@ class VCFWriter extends IndexingVariantContextWriter { missingSampleError(vc, mHeader); } - List attrs = new ArrayList(genotypeFormatKeys.size()); + final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { - if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { if ( !g.isAvailable() ) { throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record"); @@ -363,54 +362,53 @@ class VCFWriter extends IndexingVariantContextWriter { } continue; - } - - String outputValue; - final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field); - if ( accessor != null ) { - final int[] intValues = accessor.getValues(g); - if ( intValues == null ) - outputValue = VCFConstants.MISSING_VALUE_v4; - else if ( intValues.length == 1 ) // fast path - outputValue = Integer.toString(intValues[0]); - else { - StringBuilder sb = new StringBuilder(); - sb.append(intValues[0]); - for ( int i = 1; i < intValues.length; i++) { - sb.append(","); - 
sb.append(intValues[i]); - } - outputValue = sb.toString(); - } } else { - Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; - - // some exceptions + String outputValue; if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) { - val = g.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())) : VCFConstants.PASSES_FILTERS_v4; - } - - VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); - if ( metaData != null ) { - int numInFormatField = metaData.getCount(vc); - if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { - // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. - // For example, if Number=2, the string has to be ".,." - StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); - for ( int i = 1; i < numInFormatField; i++ ) { - sb.append(","); - sb.append(VCFConstants.MISSING_VALUE_v4); + outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4; + } else { + final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field); + if ( accessor != null ) { + final int[] intValues = accessor.getValues(g); + if ( intValues == null ) + outputValue = VCFConstants.MISSING_VALUE_v4; + else if ( intValues.length == 1 ) // fast path + outputValue = Integer.toString(intValues[0]); + else { + StringBuilder sb = new StringBuilder(); + sb.append(intValues[0]); + for ( int i = 1; i < intValues.length; i++) { + sb.append(","); + sb.append(intValues[i]); + } + outputValue = sb.toString(); } - val = sb.toString(); + } else { + Object val = g.hasExtendedAttribute(field) ? 
g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4; + + VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field); + if ( metaData != null ) { + int numInFormatField = metaData.getCount(vc); + if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) { + // If we have a missing field but multiple values are expected, we need to construct a new string with all fields. + // For example, if Number=2, the string has to be ".,." + StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4); + for ( int i = 1; i < numInFormatField; i++ ) { + sb.append(","); + sb.append(VCFConstants.MISSING_VALUE_v4); + } + val = sb.toString(); + } + } + + // assume that if key is absent, then the given string encoding suffices + outputValue = formatVCFField(val); } } - // assume that if key is absent, then the given string encoding suffices - outputValue = formatVCFField(val); + if ( outputValue != null ) + attrs.add(outputValue); } - - if ( outputValue != null ) - attrs.add(outputValue); } // strip off trailing missing values @@ -524,7 +522,7 @@ class VCFWriter extends IndexingVariantContextWriter { if ( g.hasDP() ) sawDP = true; if ( g.hasAD() ) sawAD = true; if ( g.hasPL() ) sawPL = true; - if (g.isFiltered() && g.isCalled()) sawGenotypeFilter = true; + if (g.isFiltered()) sawGenotypeFilter = true; } if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY); diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 55de62431..86b7e60ff 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -15,10 +15,7 @@ import org.testng.SkipException; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @@ -296,6 +293,12 @@ public abstract class BaseTest { 
assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); } + public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { + final Set actualSet = new HashSet(actual); + final Set expectedSet = new HashSet(expected); + Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps + } + public static final void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately Assert.assertTrue(Double.isNaN(actual)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java index 13519c815..2af88b109 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java @@ -18,22 +18,21 @@ public class SymbolicAllelesIntegrationTest extends WalkerTest { " --no_cmdline_in_header"; } - - @Test(enabled = false) + @Test(enabled = true) public void test1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(b36KGReference, "symbolic_alleles_1.vcf"), 1, - Arrays.asList("c79137da24ad4dc15cedc742de39247f")); + Arrays.asList("5bafc5a99ea839e686e55de93f91fd5c")); executeTest("Test symbolic alleles", spec); } - @Test(enabled = false) + @Test(enabled = true) public void test2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(b36KGReference, "symbolic_alleles_2.vcf"), 1, - Arrays.asList("3f6cbbd5fdf164d87081a3af19eeeba7")); + Arrays.asList("bf5a09f783ab1fa44774c81f91d10921")); executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 8fe96b53d..869011a99 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -75,7 +75,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("fbbbebfda35bab3f6f62eea2f0be1c01")); + "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d8906b67c7f9fdb5b37b8e9e050982d3")); spec.disableShadowBCF(); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 63b2d39f1..55b04f1b8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -38,17 +38,17 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { private void DTTest(String testName, String args, String md5) { String base = String.format("-T DiagnoseTargets --no_cmdline_in_header -R %s -L %s", REF, L) + " -o %s "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5)); - spec.disableShadowBCF(); + //spec.disableShadowBCF(); executeTest(testName, spec); } @Test(enabled = true) public void testSingleSample() { - 
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "ef71a569a48697c89e642cdda7bfb766"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "1e6e15156e01e736274898fdac77d911"); + DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 70a10a0b5..ae5128c75 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("c5ed9dd3975b3602293bb484b4fda5f4")); + Arrays.asList("060e9e7b6faf8b2f7b3291594eb6b39c")); executeTest("test genotype filter #1", spec1); } @@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { public void testGenotypeFilters2() { WalkerTestSpec spec2 = new WalkerTestSpec( baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, - Arrays.asList("979ccdf484259117aa31305701075602")); + Arrays.asList("00f90028a8c0d56772c47f039816b585")); executeTest("test genotype filter #2", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index e0cda07d7..c075bb390 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -50,7 +50,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst(); } @@ -99,7 +98,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile %s" + " -tranchesFile %s", Arrays.asList(params.recalMD5, params.tranchesMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); } @@ -116,7 +114,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); - spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles + spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java new file mode 100644 index 000000000..8cd051e01 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFAlleleClipperUnitTest.java @@ -0,0 +1,226 @@ +/* + * 
Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.utils.codecs.vcf; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class VCFAlleleClipperUnitTest extends BaseTest { + // -------------------------------------------------------------------------------- + // + // Test allele clipping + // + // -------------------------------------------------------------------------------- + + private class ClipAllelesTest extends TestDataProvider { + final int position; + final int stop; + final String ref; + List inputs; + List expected; + + @Requires("arg.length % 2 == 0") + private ClipAllelesTest(final int position, final int stop, final String ... arg) { + super(ClipAllelesTest.class); + this.position = position; + this.stop = stop; + this.ref = arg[0]; + + int n = arg.length / 2; + inputs = new ArrayList(n); + expected = new ArrayList(n); + + for ( int i = 0; i < n; i++ ) { + final boolean ref = i % n == 0; + inputs.add(Allele.create(arg[i], ref)); + } + for ( int i = n; i < arg.length; i++ ) { + final boolean ref = i % n == 0; + expected.add(Allele.create(arg[i], ref)); + } + } + + public boolean isClipped() { + for ( int i = 0; i < inputs.size(); i++ ) { + if ( inputs.get(i).length() != expected.get(i).length() ) + return true; + } + + return false; + } + + public String toString() { + return String.format("ClipAllelesTest input=%s expected=%s", inputs, expected); + } + } + @DataProvider(name = "ClipAllelesTest") + public Object[][] makeClipAllelesTest() { + // do no harm + new ClipAllelesTest(10, 10, "A", "A"); + new ClipAllelesTest(10, 10, "A", "C", "A", "C"); + new ClipAllelesTest(10, 10, "A", "C", "G", "A", "C", "G"); + + // insertions + new ClipAllelesTest(10, 10, "A", "AA", "-", "A"); + new ClipAllelesTest(10, 
10, "A", "AAA", "-", "AA"); + new ClipAllelesTest(10, 10, "A", "AG", "-", "G"); + + // deletions + new ClipAllelesTest(10, 11, "AA", "A", "A", "-"); + new ClipAllelesTest(10, 12, "AAA", "A", "AA", "-"); + new ClipAllelesTest(10, 11, "AG", "A", "G", "-"); + new ClipAllelesTest(10, 12, "AGG", "A", "GG", "-"); + + // multi-allelic insertion and deletions + new ClipAllelesTest(10, 11, "AA", "A", "AAA", "A", "-", "AA"); + new ClipAllelesTest(10, 11, "AA", "A", "AAG", "A", "-", "AG"); + new ClipAllelesTest(10, 10, "A", "AA", "AAA", "-", "A", "AA"); + new ClipAllelesTest(10, 10, "A", "AA", "ACA", "-", "A", "CA"); + new ClipAllelesTest(10, 12, "ACG", "ATC", "AGG", "CG", "TC", "GG"); + new ClipAllelesTest(10, 11, "AC", "AT", "AG", "C", "T", "G"); + + // cannot be clipped + new ClipAllelesTest(10, 11, "AC", "CT", "AG", "AC", "CT", "AG"); + new ClipAllelesTest(10, 11, "AC", "CT", "GG", "AC", "CT", "GG"); + + // symbolic + new ClipAllelesTest(10, 100, "A", "", "A", ""); + new ClipAllelesTest(50, 50, "G", "G]22:60]", "G", "G]22:60]"); + new ClipAllelesTest(51, 51, "T", "]22:55]T", "T", "]22:55]T"); + new ClipAllelesTest(52, 52, "C", "C[22:51[", "C", "C[22:51["); + new ClipAllelesTest(60, 60, "A", "A]22:50]", "A", "A]22:50]"); + + // symbolic with alleles that should be clipped + new ClipAllelesTest(10, 100, "A", "", "AA", "-", "", "A"); + new ClipAllelesTest(10, 100, "AA", "", "A", "A", "", "-"); + new ClipAllelesTest(10, 100, "AA", "", "A", "AAA", "A", "", "-", "AA"); + new ClipAllelesTest(10, 100, "AG", "", "A", "AGA", "G", "", "-", "GA"); + new ClipAllelesTest(10, 100, "G", "", "A", "G", "", "A"); + + // clipping from both ends + // + // TODO -- THIS CODE IS BROKEN BECAUSE CLIPPING DOES WORK WITH ALLELES CLIPPED FROM THE END + // +// new ClipAllelesTest(10, 10, "ATA", "ATTA", "-", "T"); +// new ClipAllelesTest(10, 10, "ATAA", "ATTAA", "-", "T"); +// new ClipAllelesTest(10, 10, "ATAAG", "ATTAAG", "-", "T"); +// new ClipAllelesTest(10, 11, "GTA", "ATTA", "G", "AT"); +// new 
ClipAllelesTest(10, 11, "GTAA", "ATTAA", "G", "AT"); +// new ClipAllelesTest(10, 11, "GTAAG", "ATTAAG", "G", "AT"); + + // complex substitutions + new ClipAllelesTest(10, 10, "A", "GA", "A", "GA"); + + return ClipAllelesTest.getTests(ClipAllelesTest.class); + } + + @Test(dataProvider = "ClipAllelesTest") + public void testClipAllelesTest(ClipAllelesTest cfg) { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs, cfg.stop); + Assert.assertNull(clipped.getError(), "Unexpected error occurred"); + Assert.assertEquals(clipped.getStop(), cfg.stop, "Clipped alleles stop"); + Assert.assertEquals(clipped.getClippedAlleles(), cfg.expected, "Clipped alleles"); + } + + @Test(dataProvider = "ClipAllelesTest", dependsOnMethods = "testClipAllelesTest") + public void testPaddingAllelesInVC(final ClipAllelesTest cfg) { + final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs, cfg.stop); + final VariantContext vc = new VariantContextBuilder("x", "1", cfg.position, cfg.stop, clipped.getClippedAlleles()) + .referenceBaseForIndel(clipped.getRefBaseForIndel()).make(); + + if ( vc.isMixed() && vc.hasSymbolicAlleles() ) + throw new SkipException("GATK cannot handle mixed variant contexts with symbolic and concrete alleles. Remove this check when allele clipping and padding is generalized"); + + Assert.assertEquals(VCFAlleleClipper.needsPadding(vc), cfg.isClipped(), "needPadding method"); + + if ( cfg.isClipped() ) { + // TODO + // TODO note that the GATK currently uses a broken approach to the clipped alleles, so the expected stop is + // TODO actually the original stop, as the original stop is +1 its true size. + // TODO + final int expectedStop = vc.getEnd(); // + (vc.hasSymbolicAlleles() ? 
0 : 1); + + final VariantContext padded = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc); + Assert.assertEquals(padded.getStart(), vc.getStart(), "padded VC start"); + Assert.assertEquals(padded.getAlleles(), cfg.inputs, "padded VC alleles == original unclipped alleles"); + Assert.assertEquals(padded.getEnd(), expectedStop, "padded VC end should be clipped VC + 1 (added a base to ref allele)"); + Assert.assertFalse(VCFAlleleClipper.needsPadding(padded), "padded VC shouldn't need padding again"); + } + } + + // -------------------------------------------------------------------------------- + // + // basic allele clipping test + // + // -------------------------------------------------------------------------------- + + private class ReverseClippingPositionTestProvider extends TestDataProvider { + final String ref; + final List alleles = new ArrayList(); + final int expectedClip; + + private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { + super(ReverseClippingPositionTestProvider.class); + this.ref = ref; + for ( final String allele : alleles ) + this.alleles.add(Allele.create(allele)); + this.expectedClip = expectedClip; + } + + @Override + public String toString() { + return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); + } + } + + @DataProvider(name = "ReverseClippingPositionTestProvider") + public Object[][] makeReverseClippingPositionTestProvider() { + // pair clipping + new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); + new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); + new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele + + // triplets + new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go + new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go + + return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); + } + + + @Test(dataProvider = "ReverseClippingPositionTestProvider") + public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { + int result = VCFAlleleClipper.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); + Assert.assertEquals(result, cfg.expectedClip); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java deleted file mode 100644 index e0fb1b876..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to 
deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -// our package -package org.broadinstitute.sting.utils.codecs.vcf; - - -// the imports for unit testing. - - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.*; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - - -public class VCFCodecUnitTest extends BaseTest { - - // -------------------------------------------------------------------------------- - // - // Provider - // - // -------------------------------------------------------------------------------- - - private class AlleleClippingTestProvider extends TestDataProvider { - final String ref; - final List alleles = new ArrayList(); - final int expectedClip; - - private AlleleClippingTestProvider(final int expectedClip, final String ref, final String ... 
alleles) { - super(AlleleClippingTestProvider.class); - this.ref = ref; - for ( final String allele : alleles ) - this.alleles.add(Allele.create(allele)); - this.expectedClip = expectedClip; - } - - @Override - public String toString() { - return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); - } - } - - @DataProvider(name = "AlleleClippingTestProvider") - public Object[][] MakeAlleleClippingTest() { - // pair clipping - new AlleleClippingTestProvider(0, "ATT", "CCG"); - new AlleleClippingTestProvider(1, "ATT", "CCT"); - new AlleleClippingTestProvider(2, "ATT", "CTT"); - new AlleleClippingTestProvider(2, "ATT", "ATT"); // cannot completely clip allele - - // triplets - new AlleleClippingTestProvider(0, "ATT", "CTT", "CGG"); - new AlleleClippingTestProvider(1, "ATT", "CTT", "CGT"); // the T can go - new AlleleClippingTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go - - return AlleleClippingTestProvider.getTests(AlleleClippingTestProvider.class); - } - - - @Test(dataProvider = "AlleleClippingTestProvider") - public void TestAlleleClipping(AlleleClippingTestProvider cfg) { - int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1); - Assert.assertEquals(result, cfg.expectedClip); - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 2a92b85e1..4d23f0fa5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -26,7 +26,7 @@ public class VCFIntegrationTest extends WalkerTest { executeTest("Test Variants To VCF from new output", spec2); } - @Test(enabled = false) + @Test(enabled = true) // See 
https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic public void testReadingAndWritingBreakpointAlleles() { String testVCF = privateTestDir + "breakpoint-example.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java index 36b2ade53..e511a7f30 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeUnitTest.java @@ -48,6 +48,26 @@ public class GenotypeUnitTest extends BaseTest { T = Allele.create("T"); } + private static final GenotypeBuilder makeGB() { + return new GenotypeBuilder("misc"); + } + + @Test + public void testFilters() { + Assert.assertFalse(makeGB().make().isFiltered(), "by default Genotypes must be PASS"); + Assert.assertNull(makeGB().make().getFilters(), "by default Genotypes must be PASS => getFilters() == null"); + Assert.assertFalse(makeGB().filter(null).make().isFiltered(), "setting filter == null => Genotypes must be PASS"); + Assert.assertNull(makeGB().filter(null).make().getFilters(), "Genotypes PASS => getFilters == null"); + Assert.assertFalse(makeGB().filter("PASS").make().isFiltered(), "setting filter == PASS => Genotypes must be PASS"); + Assert.assertNull(makeGB().filter("PASS").make().getFilters(), "Genotypes PASS => getFilters == null"); + Assert.assertTrue(makeGB().filter("x").make().isFiltered(), "setting filter != null => Genotypes must be filtered"); + Assert.assertEquals(makeGB().filter("x").make().getFilters(), "x", "Should get back the expected filter string"); + Assert.assertEquals(makeGB().filters("x", "y").make().getFilters(), "x;y", "Multiple filter field values should be joined with ;"); + Assert.assertEquals(makeGB().filters("x", "y", "z").make().getFilters(), "x;y;z",
"Multiple filter field values should be joined with ;"); + Assert.assertTrue(makeGB().filters("x", "y", "z").make().isFiltered(), "Multiple filter values should be filtered"); + Assert.assertEquals(makeGB().filter("x;y;z").make().getFilters(), "x;y;z", "Multiple filter field values should be joined with ;"); + } + // public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { // public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { // public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 528f3dd29..94ed2ce5f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -56,7 +56,7 @@ public class VariantContextTestProvider { final private static boolean ENABLE_VARARRAY_TESTS = true; final private static boolean ENABLE_PLOIDY_TESTS = true; final private static boolean ENABLE_PL_TESTS = true; - final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = false; + final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true; final private static boolean ENABLE_SOURCE_VCF_TESTS = true; final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true; final private static List TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); @@ -70,8 +70,10 @@ public class VariantContextTestProvider { testSourceVCFs.add(new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf")); testSourceVCFs.add(new 
File(BaseTest.privateTestDir + "ex2.vcf")); testSourceVCFs.add(new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf")); - if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) + if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) { testSourceVCFs.add(new File(BaseTest.privateTestDir + "diagnosis_targets_testfile.vcf")); + testSourceVCFs.add(new File(BaseTest.privateTestDir + "VQSR.mixedTest.recal")); + } } public abstract static class VariantContextIOTest { @@ -181,6 +183,7 @@ public class VariantContextTestProvider { Set metaData = new TreeSet(); addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); + addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String); addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); @@ -191,7 +194,7 @@ public class VariantContextTestProvider { addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer); addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String); addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); - addHeaderLine(metaData, "FT", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String); + addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); // prep the header metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0)); @@ -283,6 +286,15 @@ public class VariantContextTestProvider { if ( ENABLE_A_AND_G_TESTS ) addGenotypesAndGTests(); + + if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) + addSymbolicAlleleTests(); + } + + private static void addSymbolicAlleleTests() { + // two tests to ensure that the end is computed correctly when there's (and not) an END field present + add(builder().alleles("N", "").start(10).stop(11).attribute("END", 11)); + add(builder().alleles("N", "").start(10).stop(10)); } private static void addGenotypesToTestData() { @@ -509,27 +521,26 @@ public 
class VariantContextTestProvider { addGenotypeTests(site, // missing value in varlist of string attr("g1", ref, "FLOAT1", 1.0), attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5"))); - - - // - // - // TESTING GENOTYPE FILTERS - // - // - addGenotypeTests(site, - new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); - addGenotypeTests(site, - new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), - new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(), - new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); } + + // + // + // TESTING GENOTYPE FILTERS + // + // + addGenotypeTests(site, + new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(), + new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); + addGenotypeTests(site, + new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(), + new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(), + new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make()); } private static void addGenotypesAndGTests() { @@ -711,14 +722,12 @@ public class 
VariantContextTestProvider { Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.getFilters(), expected.getFilters(), "filters"); + BaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); if ( expected.hasGenotypes() ) { - final Set actualSampleSet = new HashSet(actual.getSampleNames()); - final Set expectedSampleSet = new HashSet(expected.getSampleNames()); - Assert.assertTrue(actualSampleSet.equals(expectedSampleSet), "sample names"); // note this is necessary due to testng bug for set comps + BaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); final Set samples = expected.getSampleNames(); for ( final String sample : samples ) {