Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-04-22 13:23:36 -04:00
commit 35bb55f562
7 changed files with 103 additions and 30 deletions

View File

@ -417,6 +417,11 @@ public class UnifiedGenotyperEngine {
builder.attributes(attributes);
VariantContext vcCall = builder.make();
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
if ( myAlleles.size() != vc.getAlleles().size() )
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();

View File

@ -189,7 +189,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* or the sample is called reference in this track.
*/
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false)
private RodBinding<VariantContext> discordanceTrack;
protected RodBinding<VariantContext> discordanceTrack;
/**
* A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
@ -197,7 +197,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* concordance track and they have the sample genotype call.
*/
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
private RodBinding<VariantContext> concordanceTrack;
protected RodBinding<VariantContext> concordanceTrack;
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@ -230,10 +230,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
private boolean EXCLUDE_NON_VARIANTS = false;
protected boolean EXCLUDE_NON_VARIANTS = false;
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
protected boolean EXCLUDE_FILTERED = false;
/**
@ -257,23 +257,23 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private Boolean MENDELIAN_VIOLATIONS = false;
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
private int numRandom = 0;
protected int numRandom = 0;
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
protected double fractionRandom = 0;
@Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false)
private double fractionGenotypes = 0;
protected double fractionGenotypes = 0;
/**
* This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
@ -508,7 +508,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples);
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS);
if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
boolean failedJexlMatch = false;
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
@ -645,11 +645,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* @param samples the samples to extract
* @return the subsetted VariantContext
*/
private VariantContext subsetRecord(VariantContext vc, Set<String> samples) {
private VariantContext subsetRecord(final VariantContext vc, final Set<String> samples, final boolean excludeNonVariants) {
if ( samples == null || samples.isEmpty() )
return vc;
final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles());
final VariantContext sub;
if ( excludeNonVariants )
sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used
else
sub = vc.subContextFromSamples(samples, vc.getAlleles());
VariantContextBuilder builder = new VariantContextBuilder(sub);
GenotypesContext newGC = sub.getGenotypes();

View File

@ -617,10 +617,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
return true;
}
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
public static int computeForwardClipping(final List<Allele> unclippedAlleles, final byte ref0) {
boolean clipping = true;
int symbolicAlleleCount = 0;
final byte ref0 = (byte)ref.charAt(0);
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() ) {
@ -638,7 +637,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 1 : 0;
}
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
public static int computeReverseClipping(final List<Allele> unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) {
int clipping = 0;
boolean stillClipping = true;
@ -650,14 +649,20 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
if ( a.length() - clipping == 0 )
return clipping - 1;
return clipping - (allowFullClip ? 0 : 1);
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
stillClipping = false;
else if ( ref.length() == clipping )
generateException("bad alleles encountered", lineNo);
else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) )
}
else if ( ref.length == clipping ) {
if ( allowFullClip )
stillClipping = false;
else
generateException("bad alleles encountered", lineNo);
}
else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
stillClipping = false;
}
}
if ( stillClipping )
clipping++;
@ -678,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
*/
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0));
int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo);
if ( clippedAlleles != null ) {
for ( Allele a : unclippedAlleles ) {

View File

@ -612,7 +612,7 @@ public class VariantContextUtils {
continue;
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
if ( ! genotypes.isEmpty() )
logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
genotypes = stripPLs(genotypes);
// this will remove stale AC,AF attributed from vc
@ -714,8 +714,7 @@ public class VariantContextUtils {
else if (refAllele.isNull())
trimVC = false;
else {
trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
inputVC.getReference().getDisplayString()) > 0);
trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0);
}
// nothing to do if we don't need to trim bases
@ -723,9 +722,6 @@ public class VariantContextUtils {
List<Allele> alleles = new ArrayList<Allele>();
GenotypesContext genotypes = GenotypesContext.create();
// set the reference base for indels in the attributes
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
for (final Allele a : inputVC.getAlleles()) {
@ -768,12 +764,55 @@ public class VariantContextUtils {
}
final VariantContextBuilder builder = new VariantContextBuilder(inputVC);
return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make();
}
return inputVC;
}
/**
 * Reverse-trims alleles: removes the run of bases common to the END of every
 * non-symbolic allele in the context, rebuilding each genotype to reference the
 * trimmed alleles and shortening the context's stop position accordingly.
 *
 * NOTE(review): the new stop is set to getStart() + alleles.get(0).length();
 * with 1-based inclusive coordinates the stop of a span of length L starting at
 * start is usually start + L - 1, so this may be off by one — confirm against
 * VariantContextBuilder.stop() semantics.
 *
 * @param inputVC the VariantContext whose alleles may need reverse trimming
 * @return the input context unchanged when nothing is trimmable, otherwise a
 *         new context with trimmed alleles and rewritten genotypes
 */
public static VariantContext reverseTrimAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
// (forwardClipping=0, allowFullClip=true, lineNo=-1 since no VCF line is being parsed here)
final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1);
if ( trimExtent <= 0 )
return inputVC;
final List<Allele> alleles = new ArrayList<Allele>();
GenotypesContext genotypes = GenotypesContext.create();
// maps each original allele to its trimmed replacement so genotype alleles can be rewritten consistently
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
for (final Allele a : inputVC.getAlleles()) {
// symbolic alleles (e.g. <DEL>) carry no literal bases, so they pass through untrimmed
if (a.isSymbolic()) {
alleles.add(a);
originalToTrimmedAlleleMap.put(a, a);
} else {
// get bases for current allele and create a new one with trimmed bases
byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
Allele trimmedAllele = Allele.create(newBases, a.isReference());
alleles.add(trimmedAllele);
originalToTrimmedAlleleMap.put(a, trimmedAllele);
}
}
// now we can recreate new genotypes with trimmed alleles
for ( final Genotype genotype : inputVC.getGenotypes() ) {
List<Allele> originalAlleles = genotype.getAlleles();
List<Allele> trimmedAlleles = new ArrayList<Allele>();
for ( final Allele a : originalAlleles ) {
// no-calls have no concrete allele to trim; keep them as NO_CALL
if ( a.isCalled() )
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
else
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
}
final VariantContextBuilder builder = new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length());
return builder.alleles(alleles).genotypes(genotypes).make();
}
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
GenotypesContext newGs = GenotypesContext.create(genotypes.size());

View File

@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("e948543b83bfd0640fcb994d72f8e234"));
Arrays.asList("ec907c65da5ed9b6046404b0f81422d4"));
executeTest("test Multiple SNP alleles", spec);
}
@ -74,6 +74,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test bad read", spec);
}
// Integration test: runs the UnifiedGenotyper in INDEL mode over two single-site
// intervals and compares the output to a known-good MD5 checksum. Brittle by
// design — any change to genotyper output requires regenerating the hash.
@Test
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("a70593bbb5042e2d0e46e3c932cae170"));
executeTest("test reverse trim", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing compressed output

View File

@ -163,4 +163,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testParallelization (4 threads)--" + testfile, spec);
}
// Integration test: SelectVariants on a multi-allelic callset restricted to GIH
// samples with --excludeNonVariants, checked against a known-good MD5 of the
// output VCF. Brittle by design — output changes require updating the hash.
@Test
public void testSelectFromMultiAllelic() {
String testfile = validationDataLocation + "multi-allelic.bi-allelicInGIH.vcf";
String samplesFile = validationDataLocation + "GIH.samples.list";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s -NO_HEADER -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
1,
Arrays.asList("3fb50cc1c955491048108956d7087c35")
);
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
}
}

View File

@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest {
@Test(dataProvider = "AlleleClippingTestProvider")
public void TestAlleleClipping(AlleleClippingTestProvider cfg) {
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1);
int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1);
Assert.assertEquals(result, cfg.expectedClip);
}
}