BCF2 with support symbolic alleles
-- refactored allele clipping / padding code into VCFAlleleClipping class, and added much needed docs and TODOs for methods dev guys -- Added real unit tests for (some) clipping operations in VCFUtilsUnitTest
This commit is contained in:
parent
86feea917e
commit
16276f81a1
|
|
@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -493,7 +494,7 @@ public class UnifiedGenotyperEngine {
|
|||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
|
||||
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
||||
vcCall = VCFAlleleClipper.reverseTrimAlleles(vcCall);
|
||||
|
||||
if ( annotationEngine != null && !limitedContext ) {
|
||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||
|
|
|
|||
|
|
@ -129,7 +129,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
.attribute("OriginalStart", fromInterval.getStart()).make();
|
||||
}
|
||||
|
||||
VariantContext newVC = VariantContextUtils.createVariantContextWithPaddedAlleles(vc);
|
||||
VariantContext newVC = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc);
|
||||
if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) {
|
||||
logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s",
|
||||
originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(),
|
||||
|
|
|
|||
|
|
@ -51,6 +51,8 @@ import java.util.*;
|
|||
*/
|
||||
public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
|
||||
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
|
||||
private final static boolean FORBID_SYMBOLICS = false;
|
||||
|
||||
private VCFHeader header = null;
|
||||
|
||||
/**
|
||||
|
|
@ -270,7 +272,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
|
|||
" samples in header but have a record with " + nSamples + " samples");
|
||||
|
||||
decodeID(builder);
|
||||
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
|
||||
final List<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
|
||||
decodeFilter(builder);
|
||||
decodeInfo(builder, nInfo);
|
||||
|
||||
|
|
@ -283,9 +285,9 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
|
|||
protected final static class SitesInfoForDecoding {
|
||||
final int nFormatFields;
|
||||
final int nSamples;
|
||||
final ArrayList<Allele> alleles;
|
||||
final List<Allele> alleles;
|
||||
|
||||
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
|
||||
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final List<Allele> alleles) {
|
||||
this.nFormatFields = nFormatFields;
|
||||
this.nSamples = nSamples;
|
||||
this.alleles = alleles;
|
||||
|
|
@ -325,13 +327,14 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
|
|||
* @param unclippedAlleles
|
||||
* @return
|
||||
*/
|
||||
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
|
||||
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
|
||||
final ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
|
||||
AbstractVCFCodec.clipAlleles(position, ref, unclippedAlleles, clippedAlleles, -1);
|
||||
return clippedAlleles;
|
||||
} else
|
||||
return unclippedAlleles;
|
||||
@Requires({"position > 0", "ref != null && ref.length() > 0", "! unclippedAlleles.isEmpty()"})
|
||||
@Ensures("result.size() == unclippedAlleles.size()")
|
||||
protected List<Allele> clipAllelesIfNecessary(final int position,
|
||||
final String ref,
|
||||
final List<Allele> unclippedAlleles) {
|
||||
final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(position, ref, unclippedAlleles);
|
||||
if ( clipped.getError() != null ) error(clipped.getError());
|
||||
return clipped.getClippedAlleles();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -342,9 +345,9 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
|
|||
* @return the alleles
|
||||
*/
|
||||
@Requires("nAlleles > 0")
|
||||
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
|
||||
private List<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
|
||||
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
|
||||
List<Allele> alleles = new ArrayList<Allele>(nAlleles);
|
||||
String ref = null;
|
||||
|
||||
for ( int i = 0; i < nAlleles; i++ ) {
|
||||
|
|
@ -356,7 +359,7 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
|
|||
|
||||
alleles.add(allele);
|
||||
|
||||
if ( allele.isSymbolic() )
|
||||
if ( FORBID_SYMBOLICS && allele.isSymbolic() )
|
||||
throw new ReviewedStingException("LIMITATION: GATK BCF2 codec does not yet support symbolic alleles");
|
||||
}
|
||||
assert ref != null;
|
||||
|
|
|
|||
|
|
@ -44,13 +44,13 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
|||
// initialized when this lazy decoder is created, as we know all of this from the BCF2Codec
|
||||
// and its stored here again for code cleanliness
|
||||
private final BCF2Codec codec;
|
||||
private final ArrayList<Allele> siteAlleles;
|
||||
private final List<Allele> siteAlleles;
|
||||
private final int nSamples;
|
||||
private final int nFields;
|
||||
private final GenotypeBuilder[] builders;
|
||||
|
||||
@Requires("codec.getHeader().getNGenotypeSamples() == builders.length")
|
||||
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples,
|
||||
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final List<Allele> alleles, final int nSamples,
|
||||
final int nFields, final GenotypeBuilder[] builders) {
|
||||
this.codec = codec;
|
||||
this.siteAlleles = alleles;
|
||||
|
|
|
|||
|
|
@ -229,10 +229,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
|
|||
} catch (Exception e) {
|
||||
throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo);
|
||||
}
|
||||
}
|
||||
// handle multi-positional events
|
||||
else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||
stop = clipAlleles(start, ref, alleles, null, lineNo);
|
||||
} else {
|
||||
stop = VCFAlleleClipper.clipAlleles(start, ref, alleles).stop;
|
||||
}
|
||||
|
||||
return new VCFLocFeature(locParts[0], start, stop);
|
||||
|
|
@ -342,10 +340,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
|
|||
} catch (Exception e) {
|
||||
generateException("the END value in the INFO field is not valid");
|
||||
}
|
||||
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||
ArrayList<Allele> newAlleles = new ArrayList<Allele>();
|
||||
stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
|
||||
alleles = newAlleles;
|
||||
} else {
|
||||
final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(pos, ref, alleles);
|
||||
if ( clipped.getError() != null )
|
||||
generateException(clipped.getError(), lineNo);
|
||||
stop = clipped.getStop();
|
||||
alleles = clipped.clippedAlleles;
|
||||
}
|
||||
builder.stop(stop);
|
||||
builder.alleles(alleles);
|
||||
|
|
@ -613,102 +613,6 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
|
|||
alleles.add(allele);
|
||||
}
|
||||
|
||||
public static boolean isSingleNucleotideEvent(List<Allele> alleles) {
|
||||
for ( Allele a : alleles ) {
|
||||
if ( a.length() != 1 )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static int computeForwardClipping(final List<Allele> unclippedAlleles, final byte ref0) {
|
||||
boolean clipping = true;
|
||||
int symbolicAlleleCount = 0;
|
||||
|
||||
for ( Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() ) {
|
||||
symbolicAlleleCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( a.length() < 1 || (a.getBases()[0] != ref0) ) {
|
||||
clipping = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// don't clip if all alleles are symbolic
|
||||
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
|
||||
}
|
||||
|
||||
public static int computeReverseClipping(final List<Allele> unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) {
|
||||
int clipping = 0;
|
||||
boolean stillClipping = true;
|
||||
|
||||
while ( stillClipping ) {
|
||||
for ( final Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() )
|
||||
continue;
|
||||
|
||||
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
|
||||
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
|
||||
if ( a.length() - clipping == 0 )
|
||||
return clipping - (allowFullClip ? 0 : 1);
|
||||
|
||||
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
|
||||
stillClipping = false;
|
||||
}
|
||||
else if ( ref.length == clipping ) {
|
||||
if ( allowFullClip )
|
||||
stillClipping = false;
|
||||
else
|
||||
generateException("bad alleles encountered", lineNo);
|
||||
}
|
||||
else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
|
||||
stillClipping = false;
|
||||
}
|
||||
}
|
||||
if ( stillClipping )
|
||||
clipping++;
|
||||
}
|
||||
|
||||
return clipping;
|
||||
}
|
||||
|
||||
/**
|
||||
* clip the alleles, based on the reference
|
||||
*
|
||||
* @param position the unadjusted start position (pre-clipping)
|
||||
* @param ref the reference string
|
||||
* @param unclippedAlleles the list of unclipped alleles
|
||||
* @param clippedAlleles output list of clipped alleles
|
||||
* @param lineNo the current line number in the file
|
||||
* @return the new reference end position of this event
|
||||
*/
|
||||
public static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||
|
||||
int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0));
|
||||
int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo);
|
||||
|
||||
if ( clippedAlleles != null ) {
|
||||
for ( Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() ) {
|
||||
clippedAlleles.add(a);
|
||||
} else {
|
||||
final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping);
|
||||
if ( !Allele.acceptableAlleleBases(allele) )
|
||||
generateException("Unparsable vcf record with bad allele [" + allele + "]", lineNo);
|
||||
clippedAlleles.add(Allele.create(allele, a.isReference()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the new reference length
|
||||
int refLength = ref.length() - reverseClipping;
|
||||
|
||||
return position+Math.max(refLength - 1,0);
|
||||
}
|
||||
|
||||
public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) {
|
||||
try {
|
||||
return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,369 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* All of the gross allele clipping and padding routines in one place
|
||||
*
|
||||
* Having attempted to understand / fix / document this code myself
|
||||
* I can only conclude that this entire approach needs to be rethought. This
|
||||
* code just doesn't work robustly with symbolic alleles, with multiple alleles,
|
||||
* requires a special "reference base for indels" stored in the VariantContext
|
||||
* whose correctness isn't enforced, and overall has strange special cases
|
||||
* all over the place.
|
||||
*
|
||||
* The reason this code is so complex is due to symbolics and multi-alleleic
|
||||
* variation, which frequently occur when combining variants from multiple
|
||||
* VCF files.
|
||||
*
|
||||
* TODO rethink this class, make it clean, and make it easy to create, mix, and write out alleles
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 6/12
|
||||
*/
|
||||
public final class VCFAlleleClipper {
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO I have honestly no idea what this code is really supposed to do
|
||||
// TODO computeForwardClipping is called in two places:
|
||||
// TODO
|
||||
// TODO clipAlleles() where all alleles, including ref, are passed in
|
||||
// TODO createVariantContextWithTrimmedAlleles() where only the alt alleles are passed in
|
||||
// TODO
|
||||
// TODO The problem is that the code allows us to clip
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO
|
||||
@Ensures("result == 0 || result == 1")
|
||||
public static int computeForwardClipping(final List<Allele> unclippedAlleles,
|
||||
final byte ref0) {
|
||||
boolean clipping = true;
|
||||
int symbolicAlleleCount = 0;
|
||||
|
||||
for ( final Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() ) {
|
||||
symbolicAlleleCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( a.length() < 1 || (a.getBases()[0] != ref0) ) {
|
||||
clipping = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// don't clip if all alleles are symbolic
|
||||
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
|
||||
}
|
||||
|
||||
public static int computeReverseClipping(final List<Allele> unclippedAlleles,
|
||||
final byte[] ref,
|
||||
final int forwardClipping,
|
||||
final boolean allowFullClip) {
|
||||
int clipping = 0;
|
||||
boolean stillClipping = true;
|
||||
|
||||
while ( stillClipping ) {
|
||||
for ( final Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() )
|
||||
continue;
|
||||
|
||||
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
|
||||
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
|
||||
if ( a.length() - clipping == 0 )
|
||||
return clipping - (allowFullClip ? 0 : 1);
|
||||
|
||||
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
|
||||
stillClipping = false;
|
||||
}
|
||||
else if ( ref.length == clipping ) {
|
||||
if ( allowFullClip )
|
||||
stillClipping = false;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
|
||||
stillClipping = false;
|
||||
}
|
||||
}
|
||||
if ( stillClipping )
|
||||
clipping++;
|
||||
}
|
||||
|
||||
return clipping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are the alleles describing a polymorphism substitution one base for another?
|
||||
*
|
||||
* @param alleles a list of alleles, must not be empty
|
||||
* @return Return true if the length of any allele in alleles isn't 1
|
||||
*/
|
||||
@Requires("!alleles.isEmpty()")
|
||||
private static boolean isSingleNucleotideEvent(final List<Allele> alleles) {
|
||||
for ( final Allele a : alleles ) {
|
||||
if ( a.length() != 1 )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* clip the alleles, based on the reference, returning a ClippedAlleles object describing what happened
|
||||
*
|
||||
* The ClippedAlleles object contains the implied stop position of the alleles, given the provided start
|
||||
* position, after clipping. It also contains the list of alleles, in the same order as the provided
|
||||
* unclipped ones, that are the fully clipped version of the input alleles. If an error occurs
|
||||
* during this option the getError() function returns a string describing the problem (for use in parsers).
|
||||
*
|
||||
* The basic operation are:
|
||||
*
|
||||
* single allele
|
||||
* => stop == start and clipped == unclipped
|
||||
* any number of single nucleotide events
|
||||
* => stop == start and clipped == unclipped
|
||||
* two alleles, second being symbolic
|
||||
* => stop == start and clipped == unclipped
|
||||
* Note in this case that the STOP should be computed by other means (from END in VCF, for example)
|
||||
* Note that if there's more than two alleles and the second is a symbolic the code produces an error
|
||||
* Any other case:
|
||||
* The alleles are trimmed of any sequence shared at the end of the alleles. If N bases
|
||||
* are common then the alleles will all be at least N bases shorter.
|
||||
* The stop position returned is the start position + the length of the
|
||||
* reverse trimmed only reference allele - 1.
|
||||
* If the alleles all share a single common starting sequence (just one base is considered)
|
||||
* then the alleles have this leading common base removed as well.
|
||||
*
|
||||
* TODO This code is gross and brittle and needs to be rethought from scratch
|
||||
*
|
||||
* @param start the unadjusted start position (pre-clipping)
|
||||
* @param ref the reference string
|
||||
* @param unclippedAlleles the list of unclipped alleles, including the reference allele
|
||||
* @return the new reference end position of this event
|
||||
*/
|
||||
@Requires({"position > 0", "ref != null && ref.length() > 0", "!unclippedAlleles.isEmpty()"})
|
||||
@Ensures("result != null")
|
||||
public static ClippedAlleles clipAlleles(final int start,
|
||||
final String ref,
|
||||
final List<Allele> unclippedAlleles) {
|
||||
// no variation or single nucleotide events are by definition fully clipped
|
||||
if ( unclippedAlleles.size() == 1 || isSingleNucleotideEvent(unclippedAlleles) )
|
||||
return new ClippedAlleles(start, unclippedAlleles);
|
||||
|
||||
// symbolic alleles aren't unclipped by default, and there can be only two (better than highlander style)
|
||||
if ( unclippedAlleles.size() > 1 ) {
|
||||
if ( unclippedAlleles.get(1).isSymbolic() ) {
|
||||
if ( unclippedAlleles.size() > 2 )
|
||||
return new ClippedAlleles("Detected multiple alleles at a site, including a symbolic one, which is currently unsupported");
|
||||
// we do not unclip symbolic alleles
|
||||
return new ClippedAlleles(start, unclippedAlleles);
|
||||
}
|
||||
}
|
||||
|
||||
// we've got to sort out the clipping by looking at the alleles themselves
|
||||
final int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0));
|
||||
final int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false);
|
||||
|
||||
if ( reverseClipping == -1 )
|
||||
return new ClippedAlleles("computeReverseClipping failed due to bad alleles");
|
||||
|
||||
final List<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
|
||||
for ( final Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() ) {
|
||||
clippedAlleles.add(a);
|
||||
} else {
|
||||
final byte[] allele = Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length - reverseClipping);
|
||||
if ( !Allele.acceptableAlleleBases(allele) )
|
||||
return new ClippedAlleles("Unparsable vcf record with bad allele [" + allele + "]");
|
||||
clippedAlleles.add(Allele.create(allele, a.isReference()));
|
||||
}
|
||||
}
|
||||
|
||||
// the new reference length
|
||||
final int refLength = ref.length() - reverseClipping;
|
||||
final int stop = start + Math.max(refLength - 1, 0);
|
||||
return new ClippedAlleles(stop, clippedAlleles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the alleles in inputVC should have reference bases added for padding
|
||||
*
|
||||
* We need to pad a VC with a common base if the length of the reference allele is
|
||||
* less than the length of the VariantContext. This happens because the position of
|
||||
* e.g. an indel is always one before the actual event (as per VCF convention).
|
||||
*
|
||||
* @param inputVC the VC to evaluate, cannot be null
|
||||
* @return true if
|
||||
*/
|
||||
public static boolean needsPadding(final VariantContext inputVC) {
|
||||
// biallelic sites with only symbolic never need padding
|
||||
if ( inputVC.isBiallelic() && inputVC.getAlternateAllele(0).isSymbolic() )
|
||||
return false;
|
||||
|
||||
final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1;
|
||||
final int referenceLength = inputVC.getReference().length();
|
||||
|
||||
if ( referenceLength == recordLength )
|
||||
return false;
|
||||
else if ( referenceLength == recordLength - 1 )
|
||||
return true;
|
||||
else if ( !inputVC.hasSymbolicAlleles() )
|
||||
throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) +
|
||||
" in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size");
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
public static Allele padAllele(final VariantContext vc, final Allele allele) {
|
||||
assert needsPadding(vc);
|
||||
|
||||
if ( allele.isSymbolic() )
|
||||
return allele;
|
||||
else {
|
||||
// get bases for current allele and create a new one with trimmed bases
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
sb.append((char)vc.getReferenceBaseForIndel().byteValue());
|
||||
sb.append(allele.getDisplayString());
|
||||
final String newBases = sb.toString();
|
||||
return Allele.create(newBases, allele.isReference());
|
||||
}
|
||||
}
|
||||
|
||||
public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC) {
|
||||
final boolean padVC = needsPadding(inputVC);
|
||||
|
||||
// nothing to do if we don't need to pad bases
|
||||
if ( padVC ) {
|
||||
if ( !inputVC.hasReferenceBaseForIndel() )
|
||||
throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available.");
|
||||
|
||||
final ArrayList<Allele> alleles = new ArrayList<Allele>(inputVC.getNAlleles());
|
||||
final Map<Allele, Allele> unpaddedToPadded = new HashMap<Allele, Allele>(inputVC.getNAlleles());
|
||||
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
final Allele padded = padAllele(inputVC, a);
|
||||
alleles.add(padded);
|
||||
unpaddedToPadded.put(a, padded);
|
||||
}
|
||||
|
||||
// now we can recreate new genotypes with trimmed alleles
|
||||
GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples());
|
||||
for (final Genotype g : inputVC.getGenotypes() ) {
|
||||
final List<Allele> newGenotypeAlleles = new ArrayList<Allele>(g.getAlleles().size());
|
||||
for (final Allele a : g.getAlleles()) {
|
||||
newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL);
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make());
|
||||
|
||||
}
|
||||
|
||||
return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make();
|
||||
}
|
||||
else
|
||||
return inputVC;
|
||||
|
||||
}
|
||||
|
||||
public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
|
||||
// see if we need to trim common reference base from all alleles
|
||||
|
||||
final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true);
|
||||
if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
|
||||
return inputVC;
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final GenotypesContext genotypes = GenotypesContext.create();
|
||||
final Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
if (a.isSymbolic()) {
|
||||
alleles.add(a);
|
||||
originalToTrimmedAlleleMap.put(a, a);
|
||||
} else {
|
||||
// get bases for current allele and create a new one with trimmed bases
|
||||
final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
|
||||
final Allele trimmedAllele = Allele.create(newBases, a.isReference());
|
||||
alleles.add(trimmedAllele);
|
||||
originalToTrimmedAlleleMap.put(a, trimmedAllele);
|
||||
}
|
||||
}
|
||||
|
||||
// now we can recreate new genotypes with trimmed alleles
|
||||
for ( final Genotype genotype : inputVC.getGenotypes() ) {
|
||||
final List<Allele> originalAlleles = genotype.getAlleles();
|
||||
final List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
||||
for ( final Allele a : originalAlleles ) {
|
||||
if ( a.isCalled() )
|
||||
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
||||
else
|
||||
trimmedAlleles.add(Allele.NO_CALL);
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make());
|
||||
}
|
||||
|
||||
return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? -1 : 0)).alleles(alleles).genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
public static class ClippedAlleles {
|
||||
final int stop;
|
||||
final List<Allele> clippedAlleles;
|
||||
final String error;
|
||||
|
||||
public ClippedAlleles(final int stop, final List<Allele> clippedAlleles) {
|
||||
this.stop = stop;
|
||||
this.clippedAlleles = clippedAlleles;
|
||||
this.error = null;
|
||||
}
|
||||
|
||||
public ClippedAlleles(final String error) {
|
||||
this.stop = -1;
|
||||
this.clippedAlleles = null;
|
||||
this.error = error;
|
||||
}
|
||||
|
||||
public String getError() {
|
||||
return error;
|
||||
}
|
||||
|
||||
public int getStop() {
|
||||
return stop;
|
||||
}
|
||||
|
||||
public List<Allele> getClippedAlleles() {
|
||||
return clippedAlleles;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -25,6 +25,8 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
|
|
@ -32,6 +34,7 @@ import org.broad.tribble.Feature;
|
|||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
|
|
|||
|
|
@ -558,8 +558,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
}
|
||||
|
||||
public String getAlleleStringWithRefPadding(final Allele allele) {
|
||||
if ( VariantContextUtils.needsPadding(this) )
|
||||
return VariantContextUtils.padAllele(this, allele).getDisplayString();
|
||||
if ( VCFAlleleClipper.needsPadding(this) )
|
||||
return VCFAlleleClipper.padAllele(this, allele).getDisplayString();
|
||||
else
|
||||
return allele.getDisplayString();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,7 +46,9 @@ public class VariantContextUtils {
|
|||
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
|
||||
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
|
||||
public final static String MERGE_FILTER_PREFIX = "filterIn";
|
||||
|
||||
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();
|
||||
|
||||
final public static JexlEngine engine = new JexlEngine();
|
||||
public static final int DEFAULT_PLOIDY = 2;
|
||||
|
|
@ -184,83 +186,6 @@ public class VariantContextUtils {
|
|||
return g;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the alleles in inputVC should have reference bases added for padding
|
||||
*
|
||||
* We need to pad a VC with a common base if the length of the reference allele is
|
||||
* less than the length of the VariantContext. This happens because the position of
|
||||
* e.g. an indel is always one before the actual event (as per VCF convention).
|
||||
*
|
||||
* @param inputVC the VC to evaluate, cannot be null
|
||||
* @return true if
|
||||
*/
|
||||
public static boolean needsPadding(final VariantContext inputVC) {
|
||||
final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1;
|
||||
final int referenceLength = inputVC.getReference().length();
|
||||
|
||||
if ( referenceLength == recordLength )
|
||||
return false;
|
||||
else if ( referenceLength == recordLength - 1 )
|
||||
return true;
|
||||
else if ( !inputVC.hasSymbolicAlleles() )
|
||||
throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) +
|
||||
" in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size");
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
public static Allele padAllele(final VariantContext vc, final Allele allele) {
|
||||
assert needsPadding(vc);
|
||||
|
||||
if ( allele.isSymbolic() )
|
||||
return allele;
|
||||
else {
|
||||
// get bases for current allele and create a new one with trimmed bases
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
sb.append((char)vc.getReferenceBaseForIndel().byteValue());
|
||||
sb.append(allele.getDisplayString());
|
||||
final String newBases = sb.toString();
|
||||
return Allele.create(newBases, allele.isReference());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC) {
|
||||
final boolean padVC = needsPadding(inputVC);
|
||||
|
||||
// nothing to do if we don't need to pad bases
|
||||
if ( padVC ) {
|
||||
if ( !inputVC.hasReferenceBaseForIndel() )
|
||||
throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available.");
|
||||
|
||||
final ArrayList<Allele> alleles = new ArrayList<Allele>(inputVC.getNAlleles());
|
||||
final Map<Allele, Allele> unpaddedToPadded = new HashMap<Allele, Allele>(inputVC.getNAlleles());
|
||||
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
final Allele padded = padAllele(inputVC, a);
|
||||
alleles.add(padded);
|
||||
unpaddedToPadded.put(a, padded);
|
||||
}
|
||||
|
||||
// now we can recreate new genotypes with trimmed alleles
|
||||
GenotypesContext genotypes = GenotypesContext.create(inputVC.getNSamples());
|
||||
for (final Genotype g : inputVC.getGenotypes() ) {
|
||||
final List<Allele> newGenotypeAlleles = new ArrayList<Allele>(g.getAlleles().size());
|
||||
for (final Allele a : g.getAlleles()) {
|
||||
newGenotypeAlleles.add( a.isCalled() ? unpaddedToPadded.get(a) : Allele.NO_CALL);
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make());
|
||||
|
||||
}
|
||||
|
||||
return new VariantContextBuilder(inputVC).alleles(alleles).genotypes(genotypes).make();
|
||||
}
|
||||
else
|
||||
return inputVC;
|
||||
|
||||
}
|
||||
|
||||
private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();
|
||||
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
|
||||
VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field);
|
||||
if ( metaData == null ) metaData = header.getInfoHeaderLine(field);
|
||||
|
|
@ -564,7 +489,7 @@ public class VariantContextUtils {
|
|||
for (final VariantContext vc : prepaddedVCs) {
|
||||
// also a reasonable place to remove filtered calls, if needed
|
||||
if ( ! filteredAreUncalled || vc.isNotFiltered() )
|
||||
VCs.add(createVariantContextWithPaddedAlleles(vc));
|
||||
VCs.add(VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc));
|
||||
}
|
||||
if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
|
||||
return null;
|
||||
|
|
@ -769,7 +694,7 @@ public class VariantContextUtils {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
|
||||
private static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
|
||||
// see if we need to trim common reference base from all alleles
|
||||
boolean trimVC;
|
||||
|
||||
|
|
@ -780,7 +705,7 @@ public class VariantContextUtils {
|
|||
else if (refAllele.isNull())
|
||||
trimVC = false;
|
||||
else {
|
||||
trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0);
|
||||
trimVC = (VCFAlleleClipper.computeForwardClipping(inputVC.getAlternateAlleles(), (byte) inputVC.getReference().getDisplayString().charAt(0)) > 0);
|
||||
}
|
||||
|
||||
// nothing to do if we don't need to trim bases
|
||||
|
|
@ -836,46 +761,6 @@ public class VariantContextUtils {
|
|||
return inputVC;
|
||||
}
|
||||
|
||||
public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
|
||||
// see if we need to trim common reference base from all alleles
|
||||
|
||||
final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1);
|
||||
if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
|
||||
return inputVC;
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final GenotypesContext genotypes = GenotypesContext.create();
|
||||
final Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
if (a.isSymbolic()) {
|
||||
alleles.add(a);
|
||||
originalToTrimmedAlleleMap.put(a, a);
|
||||
} else {
|
||||
// get bases for current allele and create a new one with trimmed bases
|
||||
final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
|
||||
final Allele trimmedAllele = Allele.create(newBases, a.isReference());
|
||||
alleles.add(trimmedAllele);
|
||||
originalToTrimmedAlleleMap.put(a, trimmedAllele);
|
||||
}
|
||||
}
|
||||
|
||||
// now we can recreate new genotypes with trimmed alleles
|
||||
for ( final Genotype genotype : inputVC.getGenotypes() ) {
|
||||
final List<Allele> originalAlleles = genotype.getAlleles();
|
||||
final List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
||||
for ( final Allele a : originalAlleles ) {
|
||||
if ( a.isCalled() )
|
||||
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
||||
else
|
||||
trimmedAlleles.add(Allele.NO_CALL);
|
||||
}
|
||||
genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make());
|
||||
}
|
||||
|
||||
return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? -1 : 0)).alleles(alleles).genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
|
||||
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
|
||||
|
||||
|
|
|
|||
|
|
@ -261,10 +261,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
}
|
||||
|
||||
private void buildAlleles( VariantContext vc ) throws IOException {
|
||||
final boolean needsPadding = VariantContextUtils.needsPadding(vc);
|
||||
final boolean needsPadding = VCFAlleleClipper.needsPadding(vc);
|
||||
for ( Allele allele : vc.getAlleles() ) {
|
||||
if ( needsPadding )
|
||||
allele = VariantContextUtils.padAllele(vc,allele);
|
||||
allele = VCFAlleleClipper.padAllele(vc, allele);
|
||||
final byte[] s = allele.getDisplayBases();
|
||||
encoder.encodeTypedString(s);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -162,7 +162,7 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
vc = new VariantContextBuilder(vc).noGenotypes().make();
|
||||
|
||||
try {
|
||||
vc = VariantContextUtils.createVariantContextWithPaddedAlleles(vc);
|
||||
vc = VCFAlleleClipper.createVariantContextWithPaddedAlleles(vc);
|
||||
super.add(vc);
|
||||
|
||||
Map<Allele, String> alleleMap = buildAlleleMap(vc);
|
||||
|
|
@ -522,7 +522,7 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
if ( g.hasDP() ) sawDP = true;
|
||||
if ( g.hasAD() ) sawAD = true;
|
||||
if ( g.hasPL() ) sawPL = true;
|
||||
if (g.isFiltered() && g.isCalled()) sawGenotypeFilter = true;
|
||||
if (g.isFiltered()) sawGenotypeFilter = true;
|
||||
}
|
||||
|
||||
if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
|
||||
|
|
|
|||
|
|
@ -15,10 +15,7 @@ import org.testng.SkipException;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -296,6 +293,12 @@ public abstract class BaseTest {
|
|||
assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE);
|
||||
}
|
||||
|
||||
public static final <T> void assertEqualsSet(final Set<T> actual, final Set<T> expected, final String info) {
|
||||
final Set<T> actualSet = new HashSet<T>(actual);
|
||||
final Set<T> expectedSet = new HashSet<T>(expected);
|
||||
Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps
|
||||
}
|
||||
|
||||
public static final void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) {
|
||||
if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately
|
||||
Assert.assertTrue(Double.isNaN(actual));
|
||||
|
|
|
|||
|
|
@ -18,22 +18,21 @@ public class SymbolicAllelesIntegrationTest extends WalkerTest {
|
|||
" --no_cmdline_in_header";
|
||||
}
|
||||
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void test1() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(b36KGReference, "symbolic_alleles_1.vcf"),
|
||||
1,
|
||||
Arrays.asList("c79137da24ad4dc15cedc742de39247f"));
|
||||
Arrays.asList("5bafc5a99ea839e686e55de93f91fd5c"));
|
||||
executeTest("Test symbolic alleles", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void test2() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(b36KGReference, "symbolic_alleles_2.vcf"),
|
||||
1,
|
||||
Arrays.asList("3f6cbbd5fdf164d87081a3af19eeeba7"));
|
||||
Arrays.asList("bf5a09f783ab1fa44774c81f91d10921"));
|
||||
executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testGenotypeFilters1() {
|
||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("c5ed9dd3975b3602293bb484b4fda5f4"));
|
||||
Arrays.asList("060e9e7b6faf8b2f7b3291594eb6b39c"));
|
||||
executeTest("test genotype filter #1", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
|
|||
public void testGenotypeFilters2() {
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||
baseTestString() + " -G_filter 'isHomVar == 1' -G_filterName foo --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
|
||||
Arrays.asList("979ccdf484259117aa31305701075602"));
|
||||
Arrays.asList("00f90028a8c0d56772c47f039816b585"));
|
||||
executeTest("test genotype filter #2", spec2);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -65,7 +65,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +
|
||||
" -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null),
|
||||
Arrays.asList(params.cutVCFMD5));
|
||||
spec.disableShadowBCF(); // TODO -- enable when we support symbolic alleles
|
||||
executeTest("testApplyRecalibration-"+params.inVCF, spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,91 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class VCFCodecUnitTest extends BaseTest {
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class AlleleClippingTestProvider extends TestDataProvider {
|
||||
final String ref;
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final int expectedClip;
|
||||
|
||||
private AlleleClippingTestProvider(final int expectedClip, final String ref, final String ... alleles) {
|
||||
super(AlleleClippingTestProvider.class);
|
||||
this.ref = ref;
|
||||
for ( final String allele : alleles )
|
||||
this.alleles.add(Allele.create(allele));
|
||||
this.expectedClip = expectedClip;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "AlleleClippingTestProvider")
|
||||
public Object[][] MakeAlleleClippingTest() {
|
||||
// pair clipping
|
||||
new AlleleClippingTestProvider(0, "ATT", "CCG");
|
||||
new AlleleClippingTestProvider(1, "ATT", "CCT");
|
||||
new AlleleClippingTestProvider(2, "ATT", "CTT");
|
||||
new AlleleClippingTestProvider(2, "ATT", "ATT"); // cannot completely clip allele
|
||||
|
||||
// triplets
|
||||
new AlleleClippingTestProvider(0, "ATT", "CTT", "CGG");
|
||||
new AlleleClippingTestProvider(1, "ATT", "CTT", "CGT"); // the T can go
|
||||
new AlleleClippingTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go
|
||||
|
||||
return AlleleClippingTestProvider.getTests(AlleleClippingTestProvider.class);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "AlleleClippingTestProvider")
|
||||
public void TestAlleleClipping(AlleleClippingTestProvider cfg) {
|
||||
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1);
|
||||
Assert.assertEquals(result, cfg.expectedClip);
|
||||
}
|
||||
}
|
||||
|
|
@ -26,7 +26,7 @@ public class VCFIntegrationTest extends WalkerTest {
|
|||
executeTest("Test Variants To VCF from new output", spec2);
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
// See https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic
|
||||
public void testReadingAndWritingBreakpointAlleles() {
|
||||
String testVCF = privateTestDir + "breakpoint-example.vcf";
|
||||
|
|
|
|||
|
|
@ -0,0 +1,173 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class VCFUtilsUnitTest extends BaseTest {
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test allele clipping
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class ClipAllelesTest extends TestDataProvider {
|
||||
final int position;
|
||||
final int stop;
|
||||
final String ref;
|
||||
List<Allele> inputs;
|
||||
List<Allele> expected;
|
||||
|
||||
@Requires("arg.length % 2 == 0")
|
||||
private ClipAllelesTest(final int position, final int stop, final String ... arg) {
|
||||
super(ClipAllelesTest.class);
|
||||
this.position = position;
|
||||
this.stop = stop;
|
||||
this.ref = arg[0];
|
||||
|
||||
int n = arg.length / 2;
|
||||
inputs = new ArrayList<Allele>(n);
|
||||
expected = new ArrayList<Allele>(n);
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
final boolean ref = i % n == 0;
|
||||
inputs.add(Allele.create(arg[i], ref));
|
||||
}
|
||||
for ( int i = n; i < arg.length; i++ ) {
|
||||
final boolean ref = i % n == 0;
|
||||
expected.add(Allele.create(arg[i], ref));
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("ClipAllelesTest input=%s expected=%s", inputs, expected);
|
||||
}
|
||||
}
|
||||
@DataProvider(name = "ClipAllelesTest")
|
||||
public Object[][] makeClipAllelesTest() {
|
||||
// do no harm
|
||||
new ClipAllelesTest(10, 10, "A", "A");
|
||||
new ClipAllelesTest(10, 10, "A", "C", "A", "C");
|
||||
new ClipAllelesTest(10, 10, "A", "C", "G", "A", "C", "G");
|
||||
|
||||
// insertions
|
||||
new ClipAllelesTest(10, 10, "A", "AA", "-", "A");
|
||||
new ClipAllelesTest(10, 10, "A", "AAA", "-", "AA");
|
||||
new ClipAllelesTest(10, 10, "A", "AG", "-", "G");
|
||||
|
||||
// deletions
|
||||
new ClipAllelesTest(10, 11, "AA", "A", "A", "-");
|
||||
new ClipAllelesTest(10, 12, "AAA", "A", "AA", "-");
|
||||
new ClipAllelesTest(10, 11, "AG", "A", "G", "-");
|
||||
new ClipAllelesTest(10, 12, "AGG", "A", "GG", "-");
|
||||
|
||||
// multi-allelic insertion and deletions
|
||||
new ClipAllelesTest(10, 11, "AA", "A", "AAA", "A", "-", "AA");
|
||||
new ClipAllelesTest(10, 11, "AA", "A", "AAG", "A", "-", "AG");
|
||||
new ClipAllelesTest(10, 10, "A", "AA", "AAA", "-", "A", "AA");
|
||||
new ClipAllelesTest(10, 10, "A", "AA", "ACA", "-", "A", "CA");
|
||||
new ClipAllelesTest(10, 12, "ACG", "ATC", "AGG", "CG", "TC", "GG");
|
||||
new ClipAllelesTest(10, 11, "AC", "AT", "AG", "C", "T", "G");
|
||||
|
||||
// cannot be clipped
|
||||
new ClipAllelesTest(10, 11, "AC", "CT", "AG", "AC", "CT", "AG");
|
||||
new ClipAllelesTest(10, 11, "AC", "CT", "GG", "AC", "CT", "GG");
|
||||
|
||||
// symbolic
|
||||
new ClipAllelesTest(10, 10, "A", "<DEL>", "A", "<DEL>");
|
||||
new ClipAllelesTest(50, 50, "G", "G]22:60]", "A", "G]22:60]");
|
||||
new ClipAllelesTest(51, 51, "T", "]22:55]T", "A", "]22:55]T");
|
||||
new ClipAllelesTest(52, 52, "C", "C[22:51[", "A", "C[22:51[");
|
||||
new ClipAllelesTest(60, 60, "A", "A]22:50]", "A", "A]22:50]");
|
||||
|
||||
// complex substitutions
|
||||
new ClipAllelesTest(10, 10, "A", "GA", "A", "GA");
|
||||
|
||||
return ClipAllelesTest.getTests(ClipAllelesTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "ClipAllelesTest")
|
||||
public void testClipAllelesTest(ClipAllelesTest cfg) {
|
||||
final VCFAlleleClipper.ClippedAlleles clipped = VCFAlleleClipper.clipAlleles(cfg.position, cfg.ref, cfg.inputs);
|
||||
Assert.assertNull(clipped.getError(), "Unexpected error occurred");
|
||||
Assert.assertEquals(clipped.getStop(), cfg.stop, "Clipped alleles stop");
|
||||
Assert.assertEquals(clipped.getClippedAlleles(), cfg.expected, "Clipped alleles");
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// basic allele clipping test
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class ReverseClippingPositionTestProvider extends TestDataProvider {
|
||||
final String ref;
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
final int expectedClip;
|
||||
|
||||
private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... alleles) {
|
||||
super(ReverseClippingPositionTestProvider.class);
|
||||
this.ref = ref;
|
||||
for ( final String allele : alleles )
|
||||
this.alleles.add(Allele.create(allele));
|
||||
this.expectedClip = expectedClip;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "ReverseClippingPositionTestProvider")
|
||||
public Object[][] makeReverseClippingPositionTestProvider() {
|
||||
// pair clipping
|
||||
new ReverseClippingPositionTestProvider(0, "ATT", "CCG");
|
||||
new ReverseClippingPositionTestProvider(1, "ATT", "CCT");
|
||||
new ReverseClippingPositionTestProvider(2, "ATT", "CTT");
|
||||
new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele
|
||||
|
||||
// triplets
|
||||
new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG");
|
||||
new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go
|
||||
new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go
|
||||
|
||||
return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "ReverseClippingPositionTestProvider")
|
||||
public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) {
|
||||
int result = VCFAlleleClipper.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false);
|
||||
Assert.assertEquals(result, cfg.expectedClip);
|
||||
}
|
||||
}
|
||||
|
|
@ -56,7 +56,7 @@ public class VariantContextTestProvider {
|
|||
final private static boolean ENABLE_VARARRAY_TESTS = true;
|
||||
final private static boolean ENABLE_PLOIDY_TESTS = true;
|
||||
final private static boolean ENABLE_PL_TESTS = true;
|
||||
final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = false;
|
||||
final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true;
|
||||
final private static boolean ENABLE_SOURCE_VCF_TESTS = true;
|
||||
final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true;
|
||||
final private static List<Integer> TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
|
||||
|
|
@ -70,8 +70,10 @@ public class VariantContextTestProvider {
|
|||
testSourceVCFs.add(new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"));
|
||||
testSourceVCFs.add(new File(BaseTest.privateTestDir + "ex2.vcf"));
|
||||
testSourceVCFs.add(new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf"));
|
||||
if ( ENABLE_SYMBOLIC_ALLELE_TESTS )
|
||||
if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) {
|
||||
testSourceVCFs.add(new File(BaseTest.privateTestDir + "diagnosis_targets_testfile.vcf"));
|
||||
testSourceVCFs.add(new File(BaseTest.privateTestDir + "VQSR.mixedTest.recal"));
|
||||
}
|
||||
}
|
||||
|
||||
public abstract static class VariantContextIOTest {
|
||||
|
|
@ -181,6 +183,7 @@ public class VariantContextTestProvider {
|
|||
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
|
||||
|
||||
addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String);
|
||||
addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer);
|
||||
addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String);
|
||||
addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String);
|
||||
addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
|
||||
|
|
@ -283,6 +286,15 @@ public class VariantContextTestProvider {
|
|||
|
||||
if ( ENABLE_A_AND_G_TESTS )
|
||||
addGenotypesAndGTests();
|
||||
|
||||
if ( ENABLE_SYMBOLIC_ALLELE_TESTS )
|
||||
addSymbolicAlleleTests();
|
||||
}
|
||||
|
||||
private static void addSymbolicAlleleTests() {
|
||||
// two tests to ensure that the end is computed correctly when there's (and not) an END field present
|
||||
add(builder().alleles("N", "<VQSR>").start(10).stop(11).attribute("END", 11));
|
||||
add(builder().alleles("N", "<VQSR>").start(10).stop(10));
|
||||
}
|
||||
|
||||
private static void addGenotypesToTestData() {
|
||||
|
|
@ -509,27 +521,26 @@ public class VariantContextTestProvider {
|
|||
addGenotypeTests(site, // missing value in varlist of string
|
||||
attr("g1", ref, "FLOAT1", 1.0),
|
||||
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
|
||||
|
||||
|
||||
//
|
||||
//
|
||||
// TESTING GENOTYPE FILTERS
|
||||
//
|
||||
//
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(),
|
||||
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(),
|
||||
new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
// TESTING GENOTYPE FILTERS
|
||||
//
|
||||
//
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(),
|
||||
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
|
||||
addGenotypeTests(site,
|
||||
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
|
||||
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(),
|
||||
new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
|
||||
}
|
||||
|
||||
private static void addGenotypesAndGTests() {
|
||||
|
|
@ -711,14 +722,12 @@ public class VariantContextTestProvider {
|
|||
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles");
|
||||
|
||||
assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
|
||||
Assert.assertEquals(actual.getFilters(), expected.getFilters(), "filters");
|
||||
BaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters");
|
||||
BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual());
|
||||
|
||||
Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes");
|
||||
if ( expected.hasGenotypes() ) {
|
||||
final Set<String> actualSampleSet = new HashSet<String>(actual.getSampleNames());
|
||||
final Set<String> expectedSampleSet = new HashSet<String>(expected.getSampleNames());
|
||||
Assert.assertTrue(actualSampleSet.equals(expectedSampleSet), "sample names"); // note this is necessary due to testng bug for set comps
|
||||
BaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set");
|
||||
Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names");
|
||||
final Set<String> samples = expected.getSampleNames();
|
||||
for ( final String sample : samples ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue