Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
35bb55f562
|
|
@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine {
|
|||
builder.attributes(attributes);
|
||||
VariantContext vcCall = builder.make();
|
||||
|
||||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||
if ( myAlleles.size() != vc.getAlleles().size() )
|
||||
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
||||
|
||||
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
|
||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||
final ReadBackedPileup pileup = rawContext.getBasePileup();
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* or the sample is called reference in this track.
|
||||
*/
|
||||
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false)
|
||||
private RodBinding<VariantContext> discordanceTrack;
|
||||
protected RodBinding<VariantContext> discordanceTrack;
|
||||
|
||||
/**
|
||||
* A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
|
||||
|
|
@ -197,7 +197,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* concordance track and they have the sample genotype call.
|
||||
*/
|
||||
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
|
||||
private RodBinding<VariantContext> concordanceTrack;
|
||||
protected RodBinding<VariantContext> concordanceTrack;
|
||||
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfWriter = null;
|
||||
|
|
@ -230,10 +230,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
|
||||
|
||||
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
|
||||
private boolean EXCLUDE_NON_VARIANTS = false;
|
||||
protected boolean EXCLUDE_NON_VARIANTS = false;
|
||||
|
||||
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
|
||||
private boolean EXCLUDE_FILTERED = false;
|
||||
protected boolean EXCLUDE_FILTERED = false;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -257,23 +257,23 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
private Boolean MENDELIAN_VIOLATIONS = false;
|
||||
|
||||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||
|
||||
/**
|
||||
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
|
||||
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
|
||||
*/
|
||||
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
|
||||
private int numRandom = 0;
|
||||
protected int numRandom = 0;
|
||||
|
||||
/**
|
||||
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
|
||||
*/
|
||||
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
|
||||
private double fractionRandom = 0;
|
||||
protected double fractionRandom = 0;
|
||||
|
||||
@Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false)
|
||||
private double fractionGenotypes = 0;
|
||||
protected double fractionGenotypes = 0;
|
||||
|
||||
/**
|
||||
* This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
|
||||
|
|
@ -508,7 +508,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
if (!selectedTypes.contains(vc.getType()))
|
||||
continue;
|
||||
|
||||
VariantContext sub = subsetRecord(vc, samples);
|
||||
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS);
|
||||
if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
|
||||
boolean failedJexlMatch = false;
|
||||
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
|
||||
|
|
@ -645,11 +645,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
* @param samples the samples to extract
|
||||
* @return the subsetted VariantContext
|
||||
*/
|
||||
private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
|
||||
private VariantContext subsetRecord(final VariantContext vc, final Set<String> samples, final boolean excludeNonVariants) {
|
||||
if ( samples == null || samples.isEmpty() )
|
||||
return vc;
|
||||
|
||||
final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles());
|
||||
final VariantContext sub;
|
||||
if ( excludeNonVariants )
|
||||
sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used
|
||||
else
|
||||
sub = vc.subContextFromSamples(samples, vc.getAlleles());
|
||||
VariantContextBuilder builder = new VariantContextBuilder(sub);
|
||||
|
||||
GenotypesContext newGC = sub.getGenotypes();
|
||||
|
|
|
|||
|
|
@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
|
||||
public static int computeForwardClipping(final List<Allele> unclippedAlleles, final byte ref0) {
|
||||
boolean clipping = true;
|
||||
int symbolicAlleleCount = 0;
|
||||
final byte ref0 = (byte)ref.charAt(0);
|
||||
|
||||
for ( Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() ) {
|
||||
|
|
@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
|
||||
}
|
||||
|
||||
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
|
||||
public static int computeReverseClipping(final List<Allele> unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) {
|
||||
int clipping = 0;
|
||||
boolean stillClipping = true;
|
||||
|
||||
|
|
@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
|
||||
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
|
||||
if ( a.length() - clipping == 0 )
|
||||
return clipping - 1;
|
||||
return clipping - (allowFullClip ? 0 : 1);
|
||||
|
||||
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
|
||||
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
|
||||
stillClipping = false;
|
||||
else if ( ref.length() == clipping )
|
||||
generateException("bad alleles encountered", lineNo);
|
||||
else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) )
|
||||
}
|
||||
else if ( ref.length == clipping ) {
|
||||
if ( allowFullClip )
|
||||
stillClipping = false;
|
||||
else
|
||||
generateException("bad alleles encountered", lineNo);
|
||||
}
|
||||
else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
|
||||
stillClipping = false;
|
||||
}
|
||||
}
|
||||
if ( stillClipping )
|
||||
clipping++;
|
||||
|
|
@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
*/
|
||||
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||
|
||||
int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
|
||||
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
|
||||
int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0));
|
||||
int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo);
|
||||
|
||||
if ( clippedAlleles != null ) {
|
||||
for ( Allele a : unclippedAlleles ) {
|
||||
|
|
|
|||
|
|
@ -612,7 +612,7 @@ public class VariantContextUtils {
|
|||
continue;
|
||||
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
||||
if ( ! genotypes.isEmpty() )
|
||||
logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
||||
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
||||
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
|
||||
genotypes = stripPLs(genotypes);
|
||||
// this will remove stale AC,AF attributed from vc
|
||||
|
|
@ -714,8 +714,7 @@ public class VariantContextUtils {
|
|||
else if (refAllele.isNull())
|
||||
trimVC = false;
|
||||
else {
|
||||
trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
|
||||
inputVC.getReference().getDisplayString()) > 0);
|
||||
trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0);
|
||||
}
|
||||
|
||||
// nothing to do if we don't need to trim bases
|
||||
|
|
@ -723,9 +722,6 @@ public class VariantContextUtils {
|
|||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
|
||||
// set the reference base for indels in the attributes
|
||||
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
|
||||
|
||||
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
|
|
@ -768,12 +764,55 @@ public class VariantContextUtils {
|
|||
}
|
||||
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
|
||||
return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
|
||||
return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
|
||||
}
|
||||
|
||||
return inputVC;
|
||||
}
|
||||
|
||||
public static VariantContext reverseTrimAlleles(VariantContext inputVC) {
|
||||
// see if we need to trim common reference base from all alleles
|
||||
|
||||
final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1);
|
||||
if ( trimExtent <= 0 )
|
||||
return inputVC;
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
|
||||
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
if (a.isSymbolic()) {
|
||||
alleles.add(a);
|
||||
originalToTrimmedAlleleMap.put(a, a);
|
||||
} else {
|
||||
// get bases for current allele and create a new one with trimmed bases
|
||||
byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
|
||||
Allele trimmedAllele = Allele.create(newBases, a.isReference());
|
||||
alleles.add(trimmedAllele);
|
||||
originalToTrimmedAlleleMap.put(a, trimmedAllele);
|
||||
}
|
||||
}
|
||||
|
||||
// now we can recreate new genotypes with trimmed alleles
|
||||
for ( final Genotype genotype : inputVC.getGenotypes() ) {
|
||||
|
||||
List<Allele> originalAlleles = genotype.getAlleles();
|
||||
List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
||||
for ( final Allele a : originalAlleles ) {
|
||||
if ( a.isCalled() )
|
||||
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
||||
else
|
||||
trimmedAlleles.add(Allele.NO_CALL);
|
||||
}
|
||||
genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
|
||||
}
|
||||
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length());
|
||||
return builder.alleles(alleles).genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
|
||||
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("e948543b83bfd0640fcb994d72f8e234"));
|
||||
Arrays.asList("ec907c65da5ed9b6046404b0f81422d4"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
executeTest("test bad read", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("a70593bbb5042e2d0e46e3c932cae170"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing compressed output
|
||||
|
|
|
|||
|
|
@ -163,4 +163,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
executeTest("testParallelization (4 threads)--" + testfile, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSelectFromMultiAllelic() {
|
||||
String testfile = validationDataLocation + "multi-allelic.bi-allelicInGIH.vcf";
|
||||
String samplesFile = validationDataLocation + "GIH.samples.list";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b37KGReference + " -o %s -NO_HEADER -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
|
||||
1,
|
||||
Arrays.asList("3fb50cc1c955491048108956d7087c35")
|
||||
);
|
||||
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "AlleleClippingTestProvider")
|
||||
public void TestAlleleClipping(AlleleClippingTestProvider cfg) {
|
||||
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1);
|
||||
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1);
|
||||
Assert.assertEquals(result, cfg.expectedClip);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue