Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-09-12 12:23:24 -04:00
commit faad2972d6
5 changed files with 124 additions and 4 deletions

View File

@ -40,6 +40,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
@ -325,6 +326,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(doc="indel size select",required=false,fullName="maxIndelSize")
private int maxIndelSize = Integer.MAX_VALUE;
// Opt-in switch: when set, samples requested on the command line that do not appear in the VCF are
// dropped with a warning; when unset, their presence is reported as a UserException.BadInput.
@Argument(doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private static class RandomVariantStructure {
@ -386,10 +390,31 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
// first, add any requested samples
samples.addAll(samplesFromFile);
samples.addAll(samplesFromExpressions);
// first, check overlap between requested and present samples
Set<String> commandLineUniqueSamples = new HashSet<String>(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size());
commandLineUniqueSamples.addAll(samplesFromFile);
commandLineUniqueSamples.addAll(samplesFromExpressions);
commandLineUniqueSamples.addAll(sampleNames);
commandLineUniqueSamples.removeAll(vcfSamples);
// second, add the requested samples
samples.addAll(sampleNames);
samples.addAll(samplesFromExpressions);
samples.addAll(samplesFromFile);
logger.debug(Utils.join(",",commandLineUniqueSamples));
if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) {
logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
samples.removeAll(commandLineUniqueSamples);
} else if (commandLineUniqueSamples.size() > 0 ) {
throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
"Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
"A list of these samples:",
Utils.join(",",commandLineUniqueSamples),
"To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES"));
}
// if none were requested, we want all of them
if ( samples.isEmpty() ) {

View File

@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -278,7 +281,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getFlippedEncoding(Genotype g, int offset) {
byte b;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
if ( ! checkGQIsGood(g) ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_VAR;
@ -293,6 +296,16 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return (byte) (b << (2*offset));
}
/**
 * Decides whether a genotype is confident enough to be emitted, using minGenotypeQuality as the cutoff.
 *
 * An explicit GQ on the genotype takes precedence. Without one, a quality is derived from the
 * genotype likelihoods, indexing the likelihood vector by the genotype type's ordinal minus one.
 * A genotype carrying neither a GQ nor likelihoods never passes.
 *
 * NOTE(review): the likelihood-derived value comes from getGQLog10FromLikelihoods while getGQ() is
 * compared against the same threshold -- confirm that both quantities are on the same scale.
 * NOTE(review): the ordinal-based index presumably only matches the likelihood layout at
 * bi-allelic sites -- confirm that callers never pass multi-allelic genotypes here.
 *
 * @param genotype the genotype to evaluate
 * @return true if the genotype's quality is at least minGenotypeQuality, false otherwise
 */
private boolean checkGQIsGood(Genotype genotype) {
    // Prefer the GQ stored on the genotype whenever it is available.
    if ( genotype.hasGQ() ) {
        return genotype.getGQ() >= minGenotypeQuality;
    }

    // Nothing to derive a quality from: treat the genotype as failing.
    if ( ! genotype.hasLikelihoods() ) {
        return false;
    }

    final int likelihoodIndex = genotype.getType().ordinal() - 1;
    final double[] likelihoods = genotype.getLikelihoods().getAsVector();
    final double derivedQuality = GenotypeLikelihoods.getGQLog10FromLikelihoods(likelihoodIndex, likelihoods);
    return derivedQuality >= minGenotypeQuality;
}
private static String getID(VariantContext v) {
if ( v.hasID() ) {
return v.getID();

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
public class GenotypeLikelihoods {
private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5;
@ -167,10 +168,36 @@ public class GenotypeLikelihoods {
//Return the neg log10 Genotype Quality (GQ) for the given genotype
//Returns Double.NEGATIVE_INFINITY in case of missing genotype
/**
 * Computes a log10 genotype quality by using the ordinal of a genotype type as the index into the
 * likelihoods vector.
 *
 * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic
 * context. Use {@code getLog10GQ(Genotype,VariantContext)} or {@code getLog10GQ(Genotype,List<Allele>)}
 * in place of it. If you <b>know</b> you are bi-allelic, call getGQLog10FromLikelihoods directly.
 *
 * @param genotype actually a genotype type (no call, hom ref, het, hom var), not a genotype
 * @return an unsafe quantity that could be negative; in the bi-allelic case, the GQ resulting from
 *         best minus next best (if the type is the best)
 * @deprecated unsafe for multi-allelic sites; use {@code getLog10GQ(Genotype,VariantContext)} or
 *             {@code getLog10GQ(Genotype,List<Allele>)} instead
 */
@Deprecated
public double getLog10GQ(GenotypeType genotype){
    // NO_CALL is the first GenotypeType constant, so the ordinal is shifted down by one to obtain the index.
    final int likelihoodIndex = genotype.ordinal() - 1;
    final double[] likelihoods = getAsVector();
    return getGQLog10FromLikelihoods(likelihoodIndex, likelihoods);
}
/**
 * Computes the log10 genotype quality of a diploid genotype by locating its two alleles in the
 * site's allele list and translating that pair of positions into a likelihood (PL) index.
 *
 * @param genotypeAlleles the two alleles of the genotype
 * @param contextAlleles  the alleles of the site, in the order the likelihoods were computed for
 * @return the value of getGQLog10FromLikelihoods for the PL index of the allele pair
 * @throws IllegalArgumentException if either genotype allele is not contained in contextAlleles
 */
@Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"})
private double getLog10GQ(List<Allele> genotypeAlleles,List<Allele> contextAlleles) {
    final int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0));
    final int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1));

    // indexOf reports a missing element as -1; feeding that into the PL index computation would
    // yield a meaningless index, so fail loudly with the offending alleles instead.
    if ( allele1Index < 0 || allele2Index < 0 ) {
        throw new IllegalArgumentException("Genotype alleles " + genotypeAlleles
                + " are not all present in the context alleles " + contextAlleles);
    }

    // NOTE(review): the indices are passed in genotype order; confirm calculatePLindex does not
    // require the smaller index first.
    final int plIndex = calculatePLindex(allele1Index,allele2Index);
    return getGQLog10FromLikelihoods(plIndex,getAsVector());
}
/**
 * Computes the log10 genotype quality of a genotype relative to an explicit list of site alleles.
 *
 * @param genotype  the genotype whose alleles are looked up
 * @param vcAlleles the alleles of the site the genotype belongs to
 * @return the log10 genotype quality computed from this object's likelihoods
 */
public double getLog10GQ(Genotype genotype, List<Allele> vcAlleles ) {
    final List<Allele> calledAlleles = genotype.getAlleles();
    return getLog10GQ(calledAlleles, vcAlleles);
}
/**
 * Computes the log10 genotype quality of a genotype using the alleles of its variant context.
 *
 * @param genotype the genotype whose alleles are looked up
 * @param context  the variant context supplying the site's alleles
 * @return the log10 genotype quality computed from this object's likelihoods
 */
public double getLog10GQ(Genotype genotype, VariantContext context) {
    final List<Allele> siteAlleles = context.getAlleles();
    return getLog10GQ(genotype, siteAlleles);
}
public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
if(likelihoods == null)
return Double.NEGATIVE_INFINITY;

View File

@ -70,6 +70,20 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testComplexSelection--" + testfile, spec);
}
/**
 * Runs a complex selection in which some requested samples (-sn / -sf) are passed together with
 * --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES, and checks the output against a fixed MD5.
 */
@Test
public void testComplexSelectionWithNonExistingSamples() {
    final String variantFile = validationDataLocation + "test.filtered.maf_annotated.vcf";
    final String sampleListFile = validationDataLocation + "SelectVariants.samples.txt";

    final String walkerArgs = " --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + sampleListFile
            + " -env -ef -select 'DP < 250' --variant " + variantFile;

    final WalkerTestSpec testSpec = new WalkerTestSpec(
            baseTestString(walkerArgs),
            1,
            Arrays.asList("4386fbb258dcef4437495a37f5a83c53")
    );
    testSpec.disableShadowBCF();

    executeTest("testComplexSelectionWithNonExistingSamples--" + variantFile, testSpec);
}
@Test
public void testNonExistingFieldSelection() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
@ -98,6 +112,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testSampleExclusion--" + testfile, spec);
}
/**
 * Requests samples via -sn / -sf without the ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES switch and
 * expects the run to end with a UserException.BadInput.
 */
@Test
public void testSampleInclusionWithNonexistingSamples() {
    final String variantFile = validationDataLocation + "test.filtered.maf_annotated.vcf";
    final String sampleListFile = validationDataLocation + "SelectVariants.samples.txt";

    final String commandLine = "-T SelectVariants -R " + b36KGReference
            + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + sampleListFile
            + " --variant " + variantFile;

    final WalkerTestSpec testSpec = new WalkerTestSpec(
            commandLine,
            1,
            UserException.BadInput.class
    );
    testSpec.disableShadowBCF();

    executeTest("testSampleInclusionWithNonexistingSamples--" + variantFile, testSpec);
}
@Test
public void testConcordance() {

View File

@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing.
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
/**
@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest {
double [] v = new double[]{-10.5, -1.25, -5.11};
final static String vGLString = "-10.50,-1.25,-5.11";
final static String vPLString = "93,0,39";
double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC
@Test
public void testFromVector2() {
@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest {
}
}
// this test is completely broken, the method is wrong.
// It feeds the tri-allelic likelihoods to the deprecated GenotypeType-based getLog10GQ overload.
// NOTE(review): no @Test annotation is visible on this method, so it presumably is not executed
// by the test runner -- confirm that this is intentional.
public void testGetQualFromLikelihoodsMultiAllelicBroken() {
    final double expectedGQ = 1.6;

    final GenotypeLikelihoods triAllelicLikelihoods = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
    final double actualGQ = triAllelicLikelihoods.getLog10GQ(GenotypeType.HET);

    Assert.assertEquals(actualGQ, expectedGQ);
}
// Checks the allele-aware getLog10GQ overload on the tri-allelic likelihoods: the genotype is made
// of the two alternate alleles, looked up in the (ref, alt1, alt2) allele list.
// NOTE(review): no @Test annotation is visible on this method, so it presumably is not executed
// by the test runner -- confirm whether it should be annotated.
public void testGetQualFromLikelihoodsMultiAllelic() {
    final GenotypeLikelihoods triAllelicLikelihoods = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);

    // Site alleles: reference A plus two alternates, C and T.
    final Allele refAllele = Allele.create(BaseUtils.A,true);
    final Allele firstAlt = Allele.create(BaseUtils.C);
    final Allele secondAlt = Allele.create(BaseUtils.T);
    final List<Allele> siteAlleles = Arrays.asList(refAllele,firstAlt,secondAlt);

    // Genotype under test carries both alternate alleles.
    final List<Allele> calledAlleles = Arrays.asList(firstAlt,secondAlt);
    final GenotypeBuilder builder = new GenotypeBuilder();
    builder.alleles(calledAlleles);

    final double expectedGQ = 1.6;
    final double actualGQ = triAllelicLikelihoods.getLog10GQ(builder.make(),siteAlleles);
    Assert.assertEquals(actualGQ,expectedGQ);
}
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
Assert.assertEquals(v1.length, v2.length);
for ( int i = 0; i < v1.length; i++ ) {