Merge remote-tracking branch 'unstable/master'

This commit is contained in:
Geraldine Van der Auwera 2015-07-09 15:17:03 -04:00
commit 8ea4dcab8d
55 changed files with 829 additions and 302 deletions

View File

@ -13,7 +13,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>public/gatk-root</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -75,10 +75,18 @@ import java.util.*;
/**
* Variant confidence normalized by unfiltered depth of variant samples
*
* <p>This annotation puts the variant confidence QUAL score in perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.</p>
* <p>This annotation puts the variant confidence QUAL score into perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.</p>
*
* <h3>Statistical notes</h3>
* <p>The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.</p>
* <p>The QD is the QUAL score normalized by allele depth (AD) for a variant. For a single sample, the HaplotypeCaller calculates the QD by taking QUAL/AD. For multiple samples, HaplotypeCaller and GenotypeGVCFs calculate the QD by taking QUAL/AD of samples with a non hom-ref genotype call. The reason we leave out the samples with a hom-ref call is to not penalize the QUAL for the other samples with the variant call.</p>
* <p>Here is a single sample example:</p>
* <pre>2 37629 . C G 1063.77 . AC=2;AF=1.00;AN=2;DP=31;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.50;QD=34.32;SOR=2.376 GT:AD:DP:GQ:PL:QSS 1/1:0,31:31:93:1092,93,0:0,960</pre>
 * <p>QUAL/AD = 1063.77/31 = 34.32 = QD</p>
* <p>Here is a multi-sample example:</p>
* <pre>10 8046 . C T 4107.13 . AC=1;AF=0.167;AN=6;BaseQRankSum=-3.717;DP=1063;FS=1.616;MLEAC=1;MLEAF=0.167;QD=11.54
GT:AD:DP:GQ:PL:QSS 0/0:369,4:373:99:0,1007,12207:10548,98 0/0:331,1:332:99:0,967,11125:9576,27 0/1:192,164:356:99:4138,0,5291:5501,4505</pre>
* <p>QUAL/AD = 4107.13/356 = 11.54 = QD</p>
* <p>Note that currently, when HaplotypeCaller is run with `-ERC GVCF`, the QD calculation is invoked before AD itself has been calculated, due to a technical constraint. In that case, HaplotypeCaller uses the number of overlapping reads from the haplotype likelihood calculation in place of AD to calculate QD, which generally yields a very similar number. This does not cause any measurable problems, but can cause some confusion since the number may be slightly different than what you would expect to get if you did the calculation manually. For that reason, this behavior will be modified in an upcoming version.</p>
*
* <h3>Caveat</h3>
* <p>This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.</p>

View File

@ -250,12 +250,8 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
final List<Allele> outputAlleles = outputAlternativeAlleles.outputAlleles(vc.getReference());
final VariantContextBuilder builder = new VariantContextBuilder(callSourceString(), loc.getContig(), loc.getStart(), loc.getStop(), outputAlleles);
// Seems that when log10PError is 0.0, you must pass -0.0 to get a nice output at the other end otherwise is a "-0".
// Truth is that this should be fixed in the "variant" dependency code but perhaps it can be amended also in the VariantContextWriter.
//TODO Please remove this comment when this has been fixed (PT https://www.pivotaltracker.com/story/show/69492530)
//TODO and change the code below accordingly.
builder.log10PError(log10Confidence == 0.0 ? -0.0 : log10Confidence);
builder.log10PError(log10Confidence);
if ( ! passesCallThreshold(phredScaledConfidence) )
builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME);

View File

@ -1204,9 +1204,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
/**
* Is writing to an output GVCF file?
*
* @return true if the VCF output file has a .g.vcf extension
* @return true if the VCF output file has a .g.vcf or .g.vcf.gz extension
*/
private boolean isGVCF() {
return ((VariantContextWriterStub) vcfWriter).getOutputFile().getName().endsWith("." + GATKVCFUtils.GVCF_EXT);
String fileName = ((VariantContextWriterStub) vcfWriter).getOutputFile().getName();
return ( fileName.endsWith("." + GATKVCFUtils.GVCF_EXT) || fileName.endsWith("." + GATKVCFUtils.GVCF_GZ_EXT) );
}
}

View File

@ -85,7 +85,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
private final boolean doPhysicalPhasing;
protected final boolean doPhysicalPhasing;
private final GenotypingModel genotypingModel;

View File

@ -363,7 +363,7 @@ class PhasingUtils {
*
* @param gt1 genotype 1
* @param gt2 genotype 2
* @return true if genotypes have the same number of chromosomes, haplotype, number of attributes
* @return true if genotypes have the same number of chromosomes, haplotype, phased, number of attributes
* as chromosomes, and either genotype is homozygous, false otherwise
*/
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
@ -371,6 +371,10 @@ class PhasingUtils {
if (gt1.getPloidy() != gt2.getPloidy())
return false;
// If gt1 or gt2 are not phased, then can not be merged.
if (!gt1.isPhased() || !gt2.isPhased())
return false;
// If gt1 or gt2 are homozygous, then could be merged.
if (gt1.isHom() || gt2.isHom())
return true;

View File

@ -99,7 +99,7 @@ import java.util.*;
* as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array (in humans). This adaptive
* error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
* probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
* the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
* the log odds of being a true variant versus being false under the trained Gaussian mixture model.
* </p>
*
* <p>VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the

View File

@ -165,7 +165,6 @@ public class CombineGVCFs extends RodWalker<CombineGVCFs.PositionalState, Combin
// take care of the VCF headers
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
final Set<String> samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

View File

@ -219,7 +219,6 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
// add headers for annotations added by this tool
headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location"));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
@ -302,7 +301,13 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
* @return true if it has proper alternate alleles, false otherwise
*/
private boolean isProperlyPolymorphic(final VariantContext vc) {
return ( vc != null && !vc.isSymbolic() );
return ( vc != null &&
!vc.getAlternateAlleles().isEmpty() &&
(!vc.isBiallelic() ||
(!vc.getAlternateAllele(0).equals(Allele.SPAN_DEL) &&
!vc.getAlternateAllele(0).equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED))
)
);
}
/**

View File

@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
@ -126,14 +127,15 @@ public class ReferenceConfidenceVariantContextMerger {
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
// record whether it's also a spanning deletion/event (we know this because the VariantContext type is no
// longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele)
sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(Allele.SPAN_DEL) ||
vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED );
sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() );
vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet)));
}
// Add <DEL> and <NON_REF> to the end if at all required in the output.
if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE);
if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(Allele.SPAN_DEL);
if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
@ -286,10 +288,15 @@ public class ReferenceConfidenceVariantContextMerger {
for (final Allele a : vc.getAlternateAlleles()) {
if (a.isSymbolic()) {
result.add(a);
// we always skip <NON_REF> when adding to finalAlleles; this is done outside if applies.
// we also skip <*DEL> if there isn't a real alternate allele.
// we always skip <NON_REF> when adding to finalAlleles; this is done outside if it applies.
// we also skip <*:DEL> if there isn't a real alternate allele.
if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() )
finalAlleles.add(a);
} else if ( a == Allele.SPAN_DEL ) {
result.add(a);
// we skip * if there isn't a real alternate allele.
if ( !vc.isBiallelic() )
finalAlleles.add(a);
} else if (a.isCalled()) {
final Allele newAllele;
if (extraBaseCount > 0) {
@ -328,7 +335,7 @@ public class ReferenceConfidenceVariantContextMerger {
if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) )
replacement = allele;
else if ( allele.length() < vc.getReference().length() )
replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
replacement = Allele.SPAN_DEL;
else
replacement = Allele.NO_CALL;
@ -430,13 +437,72 @@ public class ReferenceConfidenceVariantContextMerger {
// create the index mapping, using the <NON-REF> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
final Allele targetAllele = targetAlleles.get(i);
// if there's more than 1 DEL allele then we need to use the best one
if ( targetAllele == Allele.SPAN_DEL && g.hasPL() ) {
final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL);
if ( occurrences > 1 ) {
final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy());
indexMapping[i] = ( indexOfBestDel == -1 ? indexOfNonRef : indexOfBestDel );
continue;
}
}
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAllele);
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
return indexMapping;
}
/**
* Returns the index of the best spanning deletion allele based on PL values
*
* @param alleles the list of alleles
* @param PLs the list of corresponding PL values
* @param ploidy the ploidy of the sample
* @return the best index or -1 if not found
*/
private static int indexOfBestDel(final List<Allele> alleles, final int[] PLs, final int ploidy) {
int bestIndex = -1;
int bestPL = Integer.MAX_VALUE;
for ( int i = 0; i < alleles.size(); i++ ) {
if ( alleles.get(i) == Allele.SPAN_DEL ) {
final int homAltIndex = findHomIndex(i, ploidy, alleles.size());
final int PL = PLs[homAltIndex];
if ( PL < bestPL ) {
bestIndex = i;
bestPL = PL;
}
}
}
return bestIndex;
}
/**
* Returns the index of the PL that represents the homozygous genotype of the given i'th allele
*
* @param i the index of the allele within the list of alleles
* @param ploidy the ploidy of the sample
* @param numAlleles the total number of alleles
* @return the hom index
*/
private static int findHomIndex(final int i, final int ploidy, final int numAlleles) {
// some quick optimizations for the common case
if ( ploidy == 2 )
return i + (i * (i + 1) / 2); // this is straight from the VCF spec on PLs
if ( ploidy == 1 )
return i;
final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(ploidy, numAlleles);
final int[] alleleIndexes = new int[ploidy];
Arrays.fill(alleleIndexes, i);
return calculator.allelesToIndex(alleleIndexes);
}
/**
* Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current
* alleles from the original AD.

View File

@ -147,11 +147,19 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
@Test
public void testInvertFilter() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1,
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invertFilterExpression", 1,
Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765"));
executeTest("test inversion of selection of filter with separate names #2", spec);
}
@Test
public void testInvertJexlFilter() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --filterName ABF -filter 'AlleleBalance >= 0.7' --filterName FSF -filter 'FisherStrand != 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("6fa6cd89bfc8b6b4dfc3da25eb36d08b")); // Differs from testInvertFilter() because their VCF header FILTER description uses the -filter argument. Their filter statuses are identical.
executeTest("test inversion of selection of filter via JEXL with separate names #2", spec);
}
@Test
public void testGenotypeFilters1() {
WalkerTestSpec spec1 = new WalkerTestSpec(
@ -207,8 +215,40 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testInvertGenotypeFilterExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1,
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invertGenotypeFilterExpression", 1,
Arrays.asList("d2664870e7145eb73a2295766482c823"));
executeTest("testInvertGenotypeFilterExpression", spec);
}
@Test
public void testInvertJexlGenotypeFilterExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP >= 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1,
Arrays.asList("8ddd8f3b5ee351c4ab79cb186b1d45ba")); // Differs from testInvertFilter because FILTER description uses the -genotypeFilterExpression argument
executeTest("testInvertJexlGenotypeFilterExpression", spec);
}
@Test
public void testSetFilteredGtoNocall() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference
+ " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf --setFilteredGtToNocall", 1,
Arrays.asList("9ff801dd726eb4fc562b278ccc6854b1"));
executeTest("testSetFilteredGtoNocall", spec);
}
@Test
public void testSetVcfFilteredGtoNocall() {
String testfile = privateTestDir + "filteredSamples.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("81b99386a64a8f2b857a7ef2bca5856e")
);
spec.disableShadowBCF();
executeTest("testSetVcfFilteredGtoNocall--" + testfile, spec);
}
}

View File

@ -129,7 +129,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest {
eventLength = 5;
alleles = getConsensusAlleles(eventLength,true,10,0.1, altBases);
Assert.assertEquals(alleles.size(),0);
Assert.assertEquals(alleles.size(),2);
}

View File

@ -224,6 +224,18 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
executeTest("testGVCFIndexNoThrow", spec);
}
/**
* Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf.gz output file is specified and the indexing arguments are omitted
*/
@Test()
public void testGVCFGzIndexNoThrow() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000000-17000100");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(GATKVCFUtils.GVCF_GZ_EXT), Arrays.asList(""));
spec.disableShadowBCF();
executeTest("testGVCFIndexNoThrow", spec);
}
@Test()
public void testWrongParameterGVCFIndexException() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",

View File

@ -81,6 +81,7 @@ public class PhasingUtilsUnitTest extends BaseTest {
private ReferenceSequenceFile referenceFile;
private Genotype genotype1;
private Genotype genotype2;
private Genotype genotype2unphased;
private String contig;
private List<Allele> alleleList1;
private List<Allele> alleleList2;
@ -93,8 +94,9 @@ public class PhasingUtilsUnitTest extends BaseTest {
genomeLocParser = new GenomeLocParser(referenceFile);
alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false));
alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false));
genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make();
genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make();
genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).phased(true).make();
genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(true).make();
genotype2unphased = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).phased(false).make();
contig = new String("1");
vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make();
vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make();
@ -184,6 +186,7 @@ public class PhasingUtilsUnitTest extends BaseTest {
@Test
public void TestAlleleSegregationIsKnown(){
Assert.assertTrue(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2));
Assert.assertFalse(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2unphased));
}
@Test

View File

@ -138,7 +138,21 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
+ " -L chr20:332341-802503",
1,
Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574"));
executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec);
executeTest("Use trio-phased VCF, adding read-backed phasing information in HP tag (as is now standard for RBP) [TEST SEVEN]", spec);
}
@Test
public void testDoNotMergeUnphasedSNPs() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ReadBackedPhasing" +
" -R " + hg19Reference +
" -I " + privateTestDir + "S17C1-8.KRAS.bam" +
" --variant " + privateTestDir + "S17C1-8_bwa_mutect_filtered.vcf" +
" --phaseQualityThresh 20.0 -enableMergeToMNP -maxDistMNP 3 -L 12:25398281-25398284" +
" -o %s" +
" --no_cmdline_in_header",
1,
Arrays.asList("59ee67d657ee955477bca94d07014ac3"));
executeTest("Do not merge unphased SNPs", spec);
}
}

View File

@ -94,14 +94,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
"41e2d951a17de433fe378bb3d9ec75d4", // tranches
"04336b2453202f286da05b69e57f66ed", // recal file
"d29fd0bdc1c8c3a171e10d29f7ffeaec"); // cut VCF
"19c77724f08d90896914d3d348807399", // recal file
"c6a186a1a9271f5de35f1e5aeb8749a6"); // cut VCF
VRTest lowPassPlusExomes = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf",
"ce4bfc6619147fe7ce1f8331bbeb86ce", // tranches
"1b33c10be7d8bf8e9accd11113835262", // recal file
"4700d52a06f2ef3a5882719b86911e51"); // cut VCF
"b7cad6a0bbbf0330e0ac712a80c3144f", // recal file
"bee399765991636461599565c9634bcf"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
@ -196,8 +196,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"3ad7f55fb3b072f373cbce0b32b66df4", // tranches
"e747c08131d58d9a4800720f6ca80e0c", // recal file
"e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF
"e91a5b25ea1eefdcff488e0326028b51", // recal file
"e6a0c5173d8c8fbd08afdc5e5e7d3a78"); // cut VCF
@DataProvider(name = "VRBCFTest")
public Object[][] createVRBCFTest() {
@ -251,14 +251,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest indelUnfiltered = new VRTest(
validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
"9a331328370889168a7aa3a625f73620", // tranches
"2cbbd146d68c40200b782e0226f71976", // recal file
"64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF
"689c7853fe2e63216da3b0d47e27740e", // recal file
"4147373ec8e0aba7ace3658677007990"); // cut VCF
VRTest indelFiltered = new VRTest(
validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
"9a331328370889168a7aa3a625f73620", // tranches
"2cbbd146d68c40200b782e0226f71976", // recal file
"c0ec662001e829f5779a9d13b1d77d80"); // cut VCF
"689c7853fe2e63216da3b0d47e27740e", // recal file
"8dd8ea31e419f68d80422b34b14e24e4"); // cut VCF
@DataProvider(name = "VRIndelTest")
public Object[][] createTestVariantRecalibratorIndel() {
@ -316,7 +316,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -o %s" +
" -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" +
" -recalFile " + privateTestDir + "VQSR.mixedTest.recal",
Arrays.asList("03a0ed00af6aac76d39e569f90594a02"));
Arrays.asList("cd42484985179c7f549e652f0f6a94d0"));
final List<File> outputFiles = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).getFirst();
setPDFsForDeletion(outputFiles);
}

View File

@ -100,7 +100,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("ebe26077809961f53d5244643d24fd45"));
Arrays.asList("7b3153135e4f8e1d137d3f4beb46f182"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -112,7 +112,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals",
1,
Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e"));
Arrays.asList("4f546634213ece6f08ec9258620b92bb"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -190,7 +190,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testMD5s() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b7c753452ab0c05f9cee538e420b87fa"));
spec.disableShadowBCF();
executeTest("testMD5s", spec);
}
@ -198,7 +198,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionOutput() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("bb6420ead95da4c72e76ca4bf5860ef0"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionOutput", spec);
}
@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBreakBlocks() throws Exception {
final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("dd31182124c4b78a8a03edb1e0cf618b"));
spec.disableShadowBCF();
executeTest("testBreakBlocks", spec);
}
@ -217,16 +217,49 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf",
1,
Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f"));
Arrays.asList("58984edf9a3a92c9fc97039b97755861"));
spec.disableShadowBCF();
executeTest("testSpanningDeletions", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSample() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.many.g.vcf",
1,
Arrays.asList("5c88e10211def13ba847c29d0fe9e191"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsForOneSample", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSampleHaploid() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.many.haploid.g.vcf",
1,
Arrays.asList("76fc5f6b949ac0b893061828af800bf8"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsForOneSampleHaploid", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSampleTetraploid() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.many.tetraploid.g.vcf",
1,
Arrays.asList("0ec79471550ec5e30540f68cb0651b14"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsForOneSampleTetraploid", spec);
}
@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"
+ " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header");
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("c0fdba537399cf28b28771963e2c5174"));
spec.disableShadowBCF();
executeTest("testWrongReferenceBaseBugFix",spec);
@ -235,7 +268,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testBasepairResolutionInput() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6aeb88ca94cb5223f26175da72b985f2"));
spec.disableShadowBCF();
executeTest("testBasepairResolutionInput", spec);
}

View File

@ -223,4 +223,18 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
executeTest("combiningGVCFsFails", spec);
} catch (Exception e) { } // do nothing
}
@Test
public void combineSymbolicVariants() {
// Just checking that this does not fail, hence no output files and MD5
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineVariants --no_cmdline_in_header -o %s "
+ " -R " + hg19RefereneWithChrPrefixInChromosomeNames
+ " -V " + privateTestDir + "WES-chr1.DEL.vcf"
+ " -V " + privateTestDir + "WGS-chr1.DEL.vcf"
+ " -genotypeMergeOptions UNIQUIFY",
0,
Arrays.asList(""));
executeTest("combineSymbolicVariants: ", spec);
}
}

View File

@ -82,7 +82,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("4dfea9a9b1a77c4c6b9edc61f9ea8da2"));
Arrays.asList("23ff3e22262929138ca1f00fc111cadf"));
executeTest("testUpdatePGT", spec);
}
@ -91,7 +91,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference),
1,
Arrays.asList("a96b79e7c3689c8d5506083cb6d27390"));
Arrays.asList("88fa4a021e4aac9a0e48bd54b2949ece"));
executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec);
}
@ -103,7 +103,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("bf3c1982ab6ffee410cb6a1fff6e7105"));
Arrays.asList("06b4e2589c5b903f7c51ae9968bebe77"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -115,7 +115,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("47d454936dc1f17cf4c4f84f02841346"));
Arrays.asList("599394c205c1d6641b9bebabbd29e13c"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -127,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" +
" -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference),
1,
Arrays.asList("5d79ea9de8ada8520d01284cf0c9f720"));
Arrays.asList("f7d5344a85e6d7fc2437d4253b424cb0"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -139,7 +139,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("d69b43cac448f45218e77308fc01e9e6"));
Arrays.asList("c9e4d1e52ee1f3a5233f1fb100f24d5e"));
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}
@ -152,7 +152,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("7c93d82758bfb6e7efec257ef8a46217"));
Arrays.asList("aa19980b9a525afed43e98c821114ae5"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@ -164,7 +164,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("5b60a7a9575ea83407aa61123960a0cc"));
Arrays.asList("f23c9d62542a69b5cbf0e9f89fdd235d"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@ -174,7 +174,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference +
" -V " + privateTestDir + "gvcfExample1.vcf",
1,
Arrays.asList("9e59b94c84dd673b8db9d35cae7e0f68"));
Arrays.asList("d602d9e5d336798e4ccb52d2b5f91677"));
executeTest("testJustOneSample", spec);
}
@ -185,14 +185,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V " + privateTestDir + "gvcfExample1.vcf" +
" -V " + privateTestDir + "gvcfExample2.vcf",
1,
Arrays.asList("8407cb9a1ab34e705e5a54a0d4146d84"));
Arrays.asList("6c6d6ef90386eb6c6ed649379aac0c13"));
executeTest("testSamplesWithDifferentLs", spec);
}
@Test(enabled = true)
public void testNoPLsException() {
// Test with input files with (1) 0/0 and (2) ./.
final String md5 = "3e69805dc1c0ada0a050a65b89ecab30";
final String md5 = "d04b32cf2fa97d303ff7fdc779a653d4";
WalkerTestSpec spec1 = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference +
" --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf",
@ -212,7 +212,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-nda"),
1,
Arrays.asList("5a036de16b7a87626d2b76727376d9df"));
Arrays.asList("7132a43d93a9855d03b27b4b0381194c"));
executeTest("testNDA", spec);
}
@ -221,7 +221,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-maxAltAlleles 1"),
1,
Arrays.asList("2f3e6879fa27128a8be7b067ded78966"));
Arrays.asList("07844593a4e1ff1110ef8c1de42cc290"));
executeTest("testMaxAltAlleles", spec);
}
@ -230,7 +230,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"),
1,
Arrays.asList("2e4a1ad71e8fc127b594077166c0344b"));
Arrays.asList("56caad762b26479ba5e2cc99222b9030"));
executeTest("testStandardConf", spec);
}
@ -274,7 +274,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" --uniquifySamples", b37KGReference),
1,
Arrays.asList("9a472c4e101fff4892efb9255c5cd8b3"));
Arrays.asList("ba36b36145e038e3cb004adf11bce96c"));
executeTest("testUniquifiedSamples", spec);
}
@ -446,7 +446,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
}
private static final String simpleSpanningDeletionsMD5 = "e8616a396d40b4918ad30189856ceb01";
private static final String simpleSpanningDeletionsMD5 = "1cf4ea1da40306741ec4b9a5fe1568b9";
@Test(enabled = true)
public void testSpanningDeletionsMD5() {
@ -476,7 +476,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf",
1,
Arrays.asList("1c418229117bc8f148a69eda9c496309"));
Arrays.asList("0aa7ceae6af1dc4fda6732e978ace864"));
spec.disableShadowBCF();
executeTest("testMultipleSpanningDeletionsMD5", spec);
}
@ -487,8 +487,31 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.delOnly.g.vcf",
1,
Arrays.asList("02cca337e097b86c5471929036ad4b64"));
spec.disableShadowBCF();
executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec);
}
@Test(enabled = true)
public void testDeprecatedSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.depr.delOnly.g.vcf",
1,
Arrays.asList("46169d08f93e5ff57856c7b64717314b"));
spec.disableShadowBCF();
executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec);
}
@Test(enabled = true)
public void testGenotypingSpanningDeletionOverSpan() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
" -V " + privateTestDir + "spanningDel.delOverSpan.1.g.vcf -V " +
privateTestDir + "spanningDel.delOverSpan.2.g.vcf",
0,
Arrays.asList("")); // we do not care about the md5; we just want to make sure it doesn't blow up with an error
spec.disableShadowBCF();
executeTest("testGenotypingSpanningDeletionOverSpan", spec);
}
}

View File

@ -64,8 +64,10 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args;
}
private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537";
private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b";
private static final String SAMPLE_EXCLUSION_MD5 = "eea22fbf1e490e59389a663c3d6a6537";
private static final String INVERT_SELECTION_MD5 = "831bc0a5a723b0681a910d668ff3757b";
private static final String MAX_FILTERED_GT_SELECTION_MD5 = "0365de1bbf7c037be00badace0a74d02";
private static final String MIN_FILTERED_GT_SELECTION_MD5 = "fcee8c8caa0696a6675961bb12664878";
@Test
public void testDiscordanceNoSampleSpecified() {
@ -199,7 +201,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile,
1,
Arrays.asList(sampleExclusionMD5)
Arrays.asList(SAMPLE_EXCLUSION_MD5)
);
spec.disableShadowBCF();
@ -216,7 +218,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile,
1,
Arrays.asList(sampleExclusionMD5)
Arrays.asList(SAMPLE_EXCLUSION_MD5)
);
spec.disableShadowBCF();
@ -557,7 +559,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
}
/**
* Test inverting the variant selection criteria
* Test inverting the variant selection criteria by the -invertSelect argument
*/
@Test
public void testInvertSelection() {
@ -565,16 +567,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile),
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' -invertSelect --variant " + testfile),
1,
Arrays.asList(invertSelectionMD5)
Arrays.asList(INVERT_SELECTION_MD5)
);
spec.disableShadowBCF();
executeTest("testInvertSelection--" + testfile, spec);
}
/**
* Test inverting the variant JEXL selection criteria
* Test inverting the variant selection criteria by inverting the JEXL expression logic following -select
*/
@Test
public void testInvertJexlSelection() {
@ -584,7 +586,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile),
1,
Arrays.asList(invertSelectionMD5)
Arrays.asList(INVERT_SELECTION_MD5)
);
spec.disableShadowBCF();
executeTest("testInvertJexlSelection--" + testfile, spec);
@ -659,10 +661,76 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String pedFile = privateTestDir + "CEUtrio.ped";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -invMv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("35921fb2dedca0ead83027a66b725794"));
executeTest("testMendelianViolationSelection--" + testFile, spec);
}
@Test
public void testMaxFilteredGenotypesSelection() {
String testfile = privateTestDir + "filteredSamples.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --maxFilteredGenotypes 1 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList(MAX_FILTERED_GT_SELECTION_MD5)
);
spec.disableShadowBCF();
executeTest("testMaxFilteredGenotypesSelection--" + testfile, spec);
}
@Test
public void testMinFilteredGenotypesSelection() {
String testfile = privateTestDir + "filteredSamples.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --minFilteredGenotypes 2 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList(MIN_FILTERED_GT_SELECTION_MD5)
);
spec.disableShadowBCF();
executeTest("testMinFilteredGenotypesSelection--" + testfile, spec);
}
@Test
public void testMaxFractionFilteredGenotypesSelection() {
String testfile = privateTestDir + "filteredSamples.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --maxFractionFilteredGenotypes 0.4 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList(MAX_FILTERED_GT_SELECTION_MD5)
);
spec.disableShadowBCF();
executeTest("testMaxFractionFilteredGenotypesSelection--" + testfile, spec);
}
@Test
public void testMinFractionFilteredGenotypesSelection() {
String testfile = privateTestDir + "filteredSamples.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --minFractionFilteredGenotypes 0.6 -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList(MIN_FILTERED_GT_SELECTION_MD5)
);
spec.disableShadowBCF();
executeTest("testMinFractionFilteredGenotypesSelection--" + testfile, spec);
}
@Test
public void testSetFilteredGtoNocall() {
String testfile = privateTestDir + "filteredSamples.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants --setFilteredGtToNocall -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("81b99386a64a8f2b857a7ef2bca5856e")
);
spec.disableShadowBCF();
executeTest("testSetFilteredGtoNocall--" + testfile, spec);
}
}

View File

@ -95,7 +95,7 @@ public class VariantContextMergerUnitTest extends BaseTest {
ATCATCT = Allele.create("ATCATCT");
ATref = Allele.create("AT",true);
Anoref = Allele.create("A",false);
del = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE;
del = Allele.SPAN_DEL;
GT = Allele.create("GT",false);
genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference)));
}

View File

@ -67,23 +67,6 @@ import java.util.ArrayList;
*/
public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingDbsnpInput() {
List<String> md5 = new ArrayList<String>();
md5.add("72e6ce7aff7dec7ca9e7580be7ddd435");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant:OldDbsnp " + GATKDataLocation + "Comparisons/Validated/dbSNP/dbsnp_129_b36.rod" +
" -T VariantsToVCF" +
" -L 1:1-30,000,000" +
" -o %s" +
" --no_cmdline_in_header",
1, // just one output file
md5);
executeTest("testVariantsToVCFUsingDbsnpInput", spec).getFirst();
}
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../public/gatk-root</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../../public/gatk-root</relativePath>
</parent>

View File

@ -9,7 +9,7 @@
<name>External Example</name>
<properties>
<gatk.version>3.4</gatk.version>
<gatk.version>3.5-SNAPSHOT</gatk.version>
<!--
gatk.basedir property must point to your checkout of GATK/GATK until we can get all the
dependencies out of the committed gatk repo and into central.

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -69,8 +69,9 @@ public class GATKVCFUtils {
public final static GATKVCFIndexType DEFAULT_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR;
public final static Integer DEFAULT_GVCF_INDEX_PARAMETER = 128000;
// GVCF file extension
// GVCF file extensions
public final static String GVCF_EXT = "g.vcf";
public final static String GVCF_GZ_EXT = "g.vcf.gz";
// Message for using the deprecated --variant_index_type or --variant_index_parameter arguments.
public final static String DEPRECATED_INDEX_ARGS_MSG = "Naming your output file using the .g.vcf extension will automatically set the appropriate values " +
@ -389,7 +390,7 @@ public class GATKVCFUtils {
indexType = variantIndexType;
indexParameter = variantIndexParameter;
logger.warn(DEPRECATED_INDEX_ARGS_MSG);
} else if (outputFile.getName().endsWith("." + GVCF_EXT)) {
} else if (outputFile.getName().endsWith("." + GVCF_EXT) || outputFile.getName().endsWith("." + GVCF_GZ_EXT)) {
indexType = DEFAULT_GVCF_INDEX_TYPE;
indexParameter = DEFAULT_GVCF_INDEX_PARAMETER;
}

View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.filters;
import htsjdk.samtools.*;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.ReadProperties;
import org.broadinstitute.gatk.utils.ValidationExclusion;
import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource;
import org.broadinstitute.gatk.utils.exceptions.UserException;
/**
* Filter out reads that are over-soft-clipped
*
* <p>
* This filter is intended to filter out reads that are potentially from foreign organisms.
* From experience with sequencing of human DNA we have found cases of contamination by bacterial
* organisms; the symptoms of such contamination are a class of reads with only a small number
* of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended
* to remove such reads.
* </p>
*
*/
public class OverclippedReadFilter extends ReadFilter {

    @Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false)
    int tooShort = 30;

    /**
     * Decides whether a read is "over-soft-clipped" and should be removed.
     *
     * <p>A read is filtered out only when its CIGAR shows a soft-clip on BOTH ends
     * (a leading S element, then read-consuming elements, then another S element)
     * AND the number of read bases consumed between the two clips is below the
     * {@code tooShort} threshold. Reads clipped on at most one end, or with enough
     * bases between the clips, pass the filter.</p>
     *
     * @param read the read whose CIGAR is inspected
     * @return true if the read should be filtered out (removed), false otherwise
     */
    public boolean filterOut(final SAMRecord read) {
        boolean leadingClipSeen = false;
        boolean consumedAlignedBases = false;
        int basesBetweenClips = 0;

        for (final CigarElement cigarElement : read.getCigar().getCigarElements()) {
            final CigarOperator op = cigarElement.getOperator();
            if (op == CigarOperator.S) {
                // Hitting a soft-clip after consuming read bases means we have seen a
                // leading clip, a middle section, and now a trailing clip: decide here.
                if (consumedAlignedBases) {
                    return basesBetweenClips < tooShort;
                }
                leadingClipSeen = true;
            } else if (op.consumesReadBases()) { // M, I, X, EQ (S was handled above)
                // A read whose first read-consuming element is not preceded by a
                // soft-clip cannot be clipped on both ends; keep it.
                if (!leadingClipSeen) {
                    return false;
                }
                consumedAlignedBases = true;
                basesBetweenClips += cigarElement.getLength();
            }
        }
        // No trailing soft-clip ever followed the aligned section; keep the read.
        return false;
    }
}

View File

@ -38,7 +38,7 @@ public class CramIntegrationTest extends WalkerTest {
@DataProvider(name="cramData")
public Object[][] getCRAMData() {
return new Object[][] {
{"PrintReads", "exampleBAM.bam", "", "cram", "026ebc00c2a8f9832e37f1a6a0f53521"},
{"PrintReads", "exampleBAM.bam", "", "cram", "fc6e3919a8a34266c89ef66e97ceaba9"},
//{"PrintReads", "exampleCRAM.cram", "", "cram", "026ebc00c2a8f9832e37f1a6a0f53521"}, https://github.com/samtools/htsjdk/issues/148
{"PrintReads", "exampleCRAM.cram", "", "bam", "99e5f740b43594a5b8e5bc1a007719e0"},
{"PrintReads", "exampleCRAM-noindex.cram", "", "bam", "99e5f740b43594a5b8e5bc1a007719e0"},

View File

@ -0,0 +1,77 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.filters;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.TextCigarCodec;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* Tests for the OverclippedReadFilter
*/
public class OverclippedReadFilterUnitTest extends ReadFilterTest {

    /**
     * Runs the OverclippedReadFilter over a read built from the given CIGAR string
     * and checks the filter's verdict against the expected value.
     */
    @Test(enabled = true, dataProvider= "OverclippedDataProvider")
    public void testOverclippedFilter(final String cigarString, final boolean expectedResult) {
        final OverclippedReadFilter overclippedFilter = new OverclippedReadFilter();
        final SAMRecord testRead = buildSAMRecord(cigarString);
        Assert.assertEquals(overclippedFilter.filterOut(testRead), expectedResult, cigarString);
    }

    /** Constructs a minimal test read whose CIGAR is decoded from the given string. */
    private SAMRecord buildSAMRecord(final String cigarString) {
        final Cigar decodedCigar = TextCigarCodec.decode(cigarString);
        return this.createRead(decodedCigar, 1, 0, 10);
    }

    /**
     * CIGAR strings paired with the expected filterOut() result: true means the read
     * is soft-clipped on both ends with too few bases in between and gets removed.
     */
    @DataProvider(name= "OverclippedDataProvider")
    public Iterator<Object[]> overclippedDataProvider() {
        final Object[][] cases = {
                { "1S10M1S", true },
                { "1S10X1S", true },
                { "1H1S10M1S1H", true },
                { "1S40M1S", false },
                { "1S40X1S", false },
                { "1H10M1S", false },
                { "1S10M1H", false },
                { "10M1S", false },
                { "1S10M", false },
                { "1S10M10D10M1S", true },
                { "1S1M40I1S", false },
                { "1S10I1S", true },
                { "1S40I1S", false },
        };
        return Arrays.asList(cases).iterator();
    }
}

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -77,7 +77,7 @@ class QGraphSettings {
var jobReportFile: String = _
@Advanced
@Argument(fullName="disableJobReport", shortName="disabpleJobReport", doc="If provided, we will not create a job report", required=false)
@Argument(fullName="disableJobReport", shortName="disableJobReport", doc="If provided, we will not create a job report", required=false)
var disableJobReport: Boolean = false
@ArgumentCollection

View File

@ -12,7 +12,7 @@
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<packaging>pom</packaging>
<name>GATK Root</name>
@ -44,8 +44,8 @@
<test.listeners>org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.gatk.utils.TestNGTestTransformer,org.broadinstitute.gatk.utils.GATKTextReporter,org.uncommons.reportng.HTMLReporter</test.listeners>
<!-- Version numbers for picard and htsjdk -->
<htsjdk.version>1.132</htsjdk.version>
<picard.version>1.131</picard.version>
<htsjdk.version>1.134</htsjdk.version>
<picard.version>1.133</picard.version>
</properties>
<!-- Dependency configuration (versions, etc.) -->

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -137,6 +137,9 @@ public class CoverageUtils {
public static Map<SAMReadGroupRecord,int[]> getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) {
Map<SAMReadGroupRecord, int[]> countsByRG = new HashMap<SAMReadGroupRecord,int[]>();
Map<String, int[]> countsByRGName = new HashMap<String, int[]>();
Map<String, SAMReadGroupRecord> RGByName = new HashMap<String, SAMReadGroupRecord>();
List<PileupElement> countPileup = new LinkedList<PileupElement>();
FragmentCollection<PileupElement> fpile;
@ -202,10 +205,20 @@ public class CoverageUtils {
for (PileupElement e : countPileup) {
SAMReadGroupRecord readGroup = getReadGroup(e.getRead());
if (!countsByRG.keySet().contains(readGroup))
countsByRG.put(readGroup, new int[6]);
updateCounts(countsByRG.get(readGroup), e);
String readGroupId = readGroup.getSample() + "_" + readGroup.getReadGroupId();
int[] counts = countsByRGName.get(readGroupId);
if (counts == null) {
counts = new int[6];
countsByRGName.put(readGroupId, counts);
RGByName.put(readGroupId, readGroup);
}
updateCounts(counts, e);
}
for (String readGroupId : RGByName.keySet()) {
countsByRG.put(RGByName.get(readGroupId), countsByRGName.get(readGroupId));
}
return countsByRG;

View File

@ -179,15 +179,21 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
/**
* Invert the selection criteria for --filterExpression
*/
@Argument(fullName="invert_filter_expression", shortName="inv_fil_sel", doc="Invert the selection criteria for --filterExpression", required=false)
@Argument(fullName="invertFilterExpression", shortName="invfilter", doc="Invert the selection criteria for --filterExpression", required=false)
protected boolean invertFilterExpression = false;
/**
* Invert the selection criteria for --genotypeFilterExpression
*/
@Argument(fullName="invert_genotype_filter_expression", shortName="inv_gen_fil_sel", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
@Argument(fullName="invertGenotypeFilterExpression", shortName="invG_filter", doc="Invert the selection criteria for --genotypeFilterExpression", required=false)
protected boolean invertGenotypeFilterExpression = false;
/**
* If this argument is provided, set filtered genotypes to no-call (./.).
*/
@Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
private boolean setFilteredGenotypesToNocall = false;
// JEXL expressions for the filters
List<VariantContextUtils.JexlVCMatchExp> filterExps;
List<VariantContextUtils.JexlVCMatchExp> genotypeFilterExps;
@ -201,8 +207,10 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
private static final int WINDOW_SIZE = 10; // 10 variants on either end of the current one
private ArrayList<FiltrationContext> windowInitializer = new ArrayList<FiltrationContext>();
private final List<Allele> diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
/**
* Prepend inverse phrase to description if --invert_filter_expression
* Prepend inverse phrase to description if --invertFilterExpression
*
* @param description the description
* @return the description with inverse prepended if --invert_filter_expression
@ -379,7 +387,7 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
final VariantContextBuilder builder = new VariantContextBuilder(vc);
// make new Genotypes based on filters
if ( !genotypeFilterExps.isEmpty() ) {
if ( !genotypeFilterExps.isEmpty() || setFilteredGenotypesToNocall ) {
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
// for each genotype, check filters then create a new object
@ -388,12 +396,17 @@ public class VariantFiltration extends RodWalker<Integer, Integer> {
final List<String> filters = new ArrayList<String>();
if ( g.isFiltered() ) filters.add(g.getFilters());
// Add if expression filters the variant context
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( Utils.invertLogic(VariantContextUtils.match(vc, g, exp), invertGenotypeFilterExpression) )
filters.add(exp.name);
}
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
// if sample is filtered and --setFilteredGtToNocall, set genotype to non-call
if ( !filters.isEmpty() && setFilteredGenotypesToNocall )
genotypes.add(new GenotypeBuilder(g).filters(filters).alleles(diploidNoCallAlleles).make());
else
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
} else {
genotypes.add(g);
}

View File

@ -146,7 +146,7 @@ import java.util.*;
* -o output.vcf \
* -se 'SAMPLE.+PARC' \
* -select "QD > 10.0"
* --invert_selection
* -invertSelect
* </pre>
*
* <h4>Select a sample and exclude non-variant loci and filtered loci (trim remaining alleles by default):</h4>
@ -207,7 +207,7 @@ import java.util.*;
* -sn mySample
* </pre>
*
* <h4>Generating a VCF of all the variants that are mendelian violations. The optional argument `-mvq` restricts the selection to sites that have a QUAL score of 50 or more</h4>
* <h4>Generating a VCF of all the variants that are mendelian violations. The optional argument '-mvq' restricts the selection to sites that have a QUAL score of 50 or more</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
@ -218,14 +218,14 @@ import java.util.*;
* -o violations.vcf
* </pre>
*
* <h4>Generating a VCF of all the variants that are not mendelian violations. The optional argument `-mvq` together with '-inv_mv' restricts the selection to sites that have a QUAL score of 50 or less</h4>
* <h4>Generating a VCF of all the variants that are not mendelian violations. The optional argument '-mvq' together with '-invMv' restricts the selection to sites that have a QUAL score of 50 or less</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T SelectVariants \
* -R reference.fasta \
* -V input.vcf \
* -ped family.ped \
* -mv -mvq 50 -inv_mv \
* -mv -mvq 50 -invMv \
* -o violations.vcf
* </pre>
*
@ -275,7 +275,7 @@ import java.util.*;
*
* <h4>Select IDs in fileKeep and exclude IDs in fileExclude:</h4>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
@ -284,9 +284,35 @@ import java.util.*;
* -excludeIDs fileExclude
* </pre>
*
* <h4>Select sites where there are between 2 and 5 samples and between 10 and 50 percent of the sample genotypes are filtered:</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* --maxFilteredGenotypes 5
* --minFilteredGenotypes 2
* --maxFractionFilteredGenotypes 0.60
* --minFractionFilteredGenotypes 0.10
* </pre>
*
* <h4>Set filtered genotypes to no-call (./.):</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* --setFilteredGenotypesToNocall
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE;
static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0;
static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0;
static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0;
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
@ -359,10 +385,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
public ArrayList<String> selectExpressions = new ArrayList<>();
/**
* Invert the selection criteria for --select.
* Invert the selection criteria for -select.
*/
@Argument(fullName="invert_selection", shortName="inv_sel", doc="Invert the selection criteria for --select", required=false)
protected boolean invertSelection = false;
@Argument(shortName="invertSelect", doc="Invert the selection criteria for -select", required=false)
protected boolean invertSelect = false;
/*
* If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none
@ -436,8 +462,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
* `-ped` argument.
*/
@Argument(fullName="invert_mendelianViolation", shortName="inv_mv", doc="Output non-mendelian violation sites only", required=false)
private Boolean invert_mendelianViolations = false;
@Argument(fullName="invertMendelianViolation", shortName="invMv", doc="Output non-mendelian violation sites only", required=false)
private Boolean invertMendelianViolations = false;
/**
* This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order
@ -514,6 +540,36 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="minIndelSize", required=false, doc="Minimum size of indels to include")
private int minIndelSize = 0;
/**
* If this argument is provided, select sites where at most a maximum number of samples are filtered at the genotype level.
*/
@Argument(fullName="maxFilteredGenotypes", required=false, doc="Maximum number of samples filtered at the genotype level")
private int maxFilteredGenotypes = MAX_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where at least a minimum number of samples are filtered at the genotype level.
*/
@Argument(fullName="minFilteredGenotypes", required=false, doc="Minimum number of samples filtered at the genotype level")
private int minFilteredGenotypes = MIN_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where a fraction or less of the samples are filtered at the genotype level.
*/
@Argument(fullName="maxFractionFilteredGenotypes", required=false, doc="Maximum fraction of samples filtered at the genotype level")
private double maxFractionFilteredGenotypes = MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
 * If this argument is provided, select sites where a fraction or more of the samples are filtered at the genotype level.
 */
@Argument(fullName="minFractionFilteredGenotypes", required=false, doc="Minimum fraction of samples filtered at the genotype level")
private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, set filtered genotypes to no-call (./.).
*/
@Argument(fullName="setFilteredGtToNocall", required=false, doc="Set filtered genotypes to no-call")
private boolean setFilteredGenotypesToNocall = false;
@Hidden
@Argument(fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES", required=false, doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.")
private boolean allowNonOverlappingCommandLineSamples = false;
@ -547,6 +603,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private Set<String> IDsToRemove = null;
private Map<String, VCFHeader> vcfRods;
private final List<Allele> diploidNoCallAlleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
/**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
*/
@ -619,7 +677,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
selectedTypes.add(t);
}
// remove specifed types
// remove specified types
for (VariantContext.Type t : typesToExclude)
selectedTypes.remove(t);
@ -713,16 +771,16 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
for (VariantContext vc : vcs) {
// an option for performance testing only
if ( fullyDecode )
vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() );
if (fullyDecode)
vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing());
if ( IDsToKeep != null && !IDsToKeep.contains(vc.getID()) )
if (IDsToKeep != null && !IDsToKeep.contains(vc.getID()))
continue;
if ( IDsToRemove != null && IDsToRemove.contains(vc.getID()) )
if (IDsToRemove != null && IDsToRemove.contains(vc.getID()))
continue;
if ( mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invert_mendelianViolations) )
if (mendelianViolations && Utils.invertLogic(mv.countViolations(this.getSampleDB().getFamilies(samples), vc) == 0, invertMendelianViolations))
break;
if (discordanceOnly) {
@ -745,20 +803,30 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType()))
continue;
if ( containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize) )
if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize))
continue;
if ( needNumFilteredGenotypes()) {
    final int numFilteredSamples = numFilteredGenotypes(vc);
    // Must divide in floating point: int/int division truncates the fraction to 0
    // (or 1 when every sample is filtered), which would make the
    // min/maxFractionFilteredGenotypes thresholds silently ineffective.
    final double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / (double) samples.size();
    // Skip sites outside the requested count/fraction window of filtered genotypes.
    if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes ||
        fractionFilteredGenotypes > maxFractionFilteredGenotypes || fractionFilteredGenotypes < minFractionFilteredGenotypes)
        continue;
}
VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates);
VariantContext filteredGenotypeToNocall = setFilteredGenotypeToNocall(sub, setFilteredGenotypesToNocall);
// Not excluding non-variants or subsetted polymorphic variants AND including filtered loci or subsetted variant is not filtered
if ( (!XLnonVariants || sub.isPolymorphicInSamples()) && (!XLfiltered || !sub.isFiltered()) ) {
if ( (!XLnonVariants || filteredGenotypeToNocall.isPolymorphicInSamples()) && (!XLfiltered || !filteredGenotypeToNocall.isFiltered()) ) {
// Write the subsetted variant if it matches all of the expressions
boolean failedJexlMatch = false;
try {
for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
if ( Utils.invertLogic(!VariantContextUtils.match(sub, jexl), invertSelection) ){
if ( Utils.invertLogic(!VariantContextUtils.match(filteredGenotypeToNocall, jexl), invertSelect) ){
failedJexlMatch = true;
break;
}
@ -771,7 +839,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( !failedJexlMatch &&
!justRead &&
( !selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) {
vcfWriter.add(sub);
vcfWriter.add(filteredGenotypeToNocall);
}
}
}
@ -800,6 +868,26 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return false;
}
/**
 * Counts how many of the selected samples carry a filtered genotype in this record.
 *
 * A genotype only counts as filtered when it is both flagged as filtered and
 * carries a non-empty filter string.
 *
 * @param vc the VariantContext to inspect (may be null)
 * @return the number of filtered sample genotypes, or 0 when vc is null
 */
private int numFilteredGenotypes(final VariantContext vc){
    if (vc == null)
        return 0;

    int filteredCount = 0;
    // Only look at the genotypes belonging to the selected sample set.
    for (final Genotype genotype : vc.getGenotypes(samples)) {
        if (genotype.isFiltered() && !genotype.getFilters().isEmpty()) {
            filteredCount++;
        }
    }
    return filteredCount;
}
/**
* Checks if vc has a variant call for (at least one of) the samples.
*
@ -807,7 +895,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* @param compVCs the comparison VariantContext (discordance)
* @return true VariantContexts are discordant, false otherwise
*/
private boolean isDiscordant (VariantContext vc, Collection<VariantContext> compVCs) {
private boolean isDiscordant (final VariantContext vc, final Collection<VariantContext> compVCs) {
if (vc == null)
return false;
@ -845,7 +933,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* @param compVCs the comparison VariantContext
* @return true if VariantContexts are concordant, false otherwise
*/
private boolean isConcordant (VariantContext vc, Collection<VariantContext> compVCs) {
private boolean isConcordant (final VariantContext vc, final Collection<VariantContext> compVCs) {
if (vc == null || compVCs == null || compVCs.isEmpty())
return false;
@ -876,7 +964,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return true;
}
private boolean sampleHasVariant(Genotype g) {
private boolean sampleHasVariant(final Genotype g) {
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !XLfiltered)));
}
@ -945,8 +1033,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
final List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
genotypes.add(new GenotypeBuilder(genotype).alleles(diploidNoCallAlleles).noGQ().make());
}
else{
genotypes.add(genotype);
@ -966,6 +1053,30 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return trimmed;
}
/**
 * Produces a copy of the record in which every called-and-filtered genotype is
 * replaced by a diploid no-call (./.), when --setFilteredGtToNocall was requested.
 *
 * @param vc the VariantContext whose filtered genotypes should be converted
 * @param filteredGenotypesToNocall whether filtered genotypes should become no-calls
 * @return vc unchanged when the conversion is disabled; otherwise a new
 *         VariantContext whose filtered genotypes are no-calls
 */
private VariantContext setFilteredGenotypeToNocall(final VariantContext vc, final boolean filteredGenotypesToNocall) {
    if ( !filteredGenotypesToNocall )
        return vc;

    final GenotypesContext updatedGenotypes = GenotypesContext.create(vc.getGenotypes().size());
    for ( final Genotype genotype : vc.getGenotypes() ) {
        // Only a genotype that was actually called AND filtered is blanked out;
        // no-calls and unfiltered genotypes pass through untouched.
        final boolean blankOut = genotype.isCalled() && genotype.isFiltered();
        updatedGenotypes.add(blankOut
                ? new GenotypeBuilder(genotype).alleles(diploidNoCallAlleles).make()
                : genotype);
    }
    return new VariantContextBuilder(vc).genotypes(updatedGenotypes).make();
}
/*
* Add annotations to the new VC
*
@ -1061,4 +1172,16 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
return result;
}
}
/**
 * Determines whether any of the filtered-genotype selection arguments was
 * supplied by the user, i.e. differs from its default value.
 *
 * @return true if filtered-genotype counting must be performed, false otherwise
 */
private boolean needNumFilteredGenotypes(){
    // When every threshold still holds its default, the (potentially costly)
    // per-site genotype scan can be skipped entirely.
    final boolean allDefaults =
            maxFilteredGenotypes == MAX_FILTERED_GENOTYPES_DEFAULT_VALUE &&
            minFilteredGenotypes == MIN_FILTERED_GENOTYPES_DEFAULT_VALUE &&
            maxFractionFilteredGenotypes == MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE &&
            minFractionFilteredGenotypes == MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
    return !allDefaults;
}
}

View File

@ -246,4 +246,20 @@ public class CatVariantsIntegrationTest {
ProcessSettings ps = new ProcessSettings(Utils.escapeExpressions(cmdLine));
pc.execAndCheck(ps);
}
// Verifies that CatVariants can write a g.vcf.gz output file (and create its index)
// by running the tool in a separate JVM and checking for a clean exit.
@Test()
public void testCatVariantsGVCFGzIndexCreation() throws IOException{
    final String classpath = StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar);
    final File outputFile = BaseTest.createTempFile("CatVariantsGVCFIndexCreationTest", "." + GATKVCFUtils.GVCF_GZ_EXT);
    final String cmdLine = String.format("java -cp \"%s\" %s -R %s -V %s -V %s -out %s",
            classpath,
            CatVariants.class.getCanonicalName(),
            BaseTest.b37KGReference,
            CatVariantsVcf1,
            CatVariantsVcf2,
            outputFile);
    final ProcessController controller = ProcessController.getThreadLocal();
    // execAndCheck fails the test if the spawned process exits non-zero.
    controller.execAndCheck(new ProcessSettings(Utils.escapeExpressions(cmdLine)));
}
}

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -165,7 +165,12 @@ public class SequenceDictionaryUtils {
}
case OUT_OF_ORDER: {
UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2);
// Note the leading space in " is not" and none after "1328), ": without them the
// concatenated user-facing message reads "...name2is not" and "...1328),  which".
UserException ex = new UserException.IncompatibleSequenceDictionaries(
        "The contig order in " + name1 + " and " + name2 + " is not "
        + "the same; to fix this please see: "
        + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), "
        + "which describes reordering contigs in BAM and VCF files.",
        name1, dict1, name2, dict2);
if ( allowNonFatalIncompabilities(validationExclusion) )
logger.warn(ex.getMessage());
else

View File

@ -28,7 +28,6 @@ package org.broadinstitute.gatk.utils.refdata;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.tribble.Feature;
import htsjdk.tribble.annotation.Strand;
import htsjdk.tribble.dbsnp.OldDbSNPFeature;
import htsjdk.tribble.gelitext.GeliTextFeature;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.GenomeLoc;
@ -111,139 +110,6 @@ public class VariantContextAdaptors {
}
}
// --------------------------------------------------------------------------------------------------------------
//
// dbSNP to VariantContext
//
// --------------------------------------------------------------------------------------------------------------
/**
 * Adapts non-VCF-formatted (old-style) dbSNP records into VariantContext objects.
 *
 * The is* predicates below classify a dbSNP record by substring-matching its
 * free-text variant-type / location-type fields, mirroring the old dbSNP flat-file
 * vocabulary ("single", "mnp", "insertion", "deletion", "in-del", "mixed").
 */
private static class DBSnpAdaptor implements VCAdaptor {
// True for a single-base substitution at an exact position.
private static boolean isSNP(OldDbSNPFeature feature) {
return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
}
// True for a multi-nucleotide polymorphism spanning a range.
private static boolean isMNP(OldDbSNPFeature feature) {
return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
}
private static boolean isInsertion(OldDbSNPFeature feature) {
return feature.getVariantType().contains("insertion");
}
private static boolean isDeletion(OldDbSNPFeature feature) {
return feature.getVariantType().contains("deletion");
}
// An indel is an insertion, a deletion, or a combined insertion-deletion event.
private static boolean isIndel(OldDbSNPFeature feature) {
return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature);
}
public static boolean isComplexIndel(OldDbSNPFeature feature) {
return feature.getVariantType().contains("in-del");
}
/**
 * gets the alternate alleles. This method should return all the alleles present at the location,
 * NOT including the reference base. This is returned as a string list with no guarantee ordering
 * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
 * frequency).
 *
 * @return an alternate allele list
 */
public static List<String> getAlternateAlleleList(OldDbSNPFeature feature) {
List<String> ret = new ArrayList<String>();
// keep every observed allele except the one matching the NCBI reference base
for (String allele : getAlleleList(feature))
if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
return ret;
}
/**
 * gets the alleles. This method should return all the alleles present at the location,
 * including the reference base. The first allele should always be the reference allele, followed
 * by an unordered list of alternate alleles.
 *
 * @return an allele list with the reference allele first, when it is present
 */
public static List<String> getAlleleList(OldDbSNPFeature feature) {
List<String> alleleList = new ArrayList<String>();
// add ref first
if ( feature.getStrand() == Strand.POSITIVE )
alleleList = Arrays.asList(feature.getObserved());
else
// negative-strand records are stored reverse-complemented; flip them back
for (String str : feature.getObserved())
alleleList.add(SequenceUtil.reverseComplement(str));
// move the reference allele to the front of the list if it is present but not already first
if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase())
&& !alleleList.get(0).equals(feature.getNCBIRefBase()) )
Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0);
return alleleList;
}
/**
 * Converts non-VCF formatted dbSNP records to VariantContext.
 * @return OldDbSNPFeature.
 */
@Override
public Class<? extends Feature> getAdaptableFeatureType() { return OldDbSNPFeature.class; }
/**
 * Converts one old-style dbSNP record into a VariantContext, padding indel
 * alleles with the preceding reference base where required. Returns null when
 * the record cannot be represented (insufficient reference context, unsupported
 * variant type, or unacceptable allele bases).
 */
@Override
public VariantContext convert(String name, Object input, ReferenceContext ref) {
OldDbSNPFeature dbsnp = (OldDbSNPFeature)input;
// index of the base immediately preceding the event within the reference window
int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
if ( index < 0 )
return null; // we weren't given enough reference context to create the VariantContext
final byte refBaseForIndel = ref.getBases()[index];
// "-" marks a null reference allele (pure insertion) in the old dbSNP format
final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-");
boolean addPaddingBase;
if ( isSNP(dbsnp) || isMNP(dbsnp) )
addPaddingBase = false;
else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") )
// indels/mixed events need a leading padding base when any allele would otherwise be empty
addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp)));
else
return null; // can't handle anything else
Allele refAllele;
if ( refBaseIsDash )
refAllele = Allele.create(refBaseForIndel, true);
else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) )
return null;
else
refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true);
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
// add all of the alt alleles
for ( String alt : getAlternateAlleleList(dbsnp) ) {
if ( Allele.wouldBeNullAllele(alt.getBytes()))
alt = "";
else if ( ! Allele.acceptableAlleleBases(alt) )
return null;
alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false));
}
final VariantContextBuilder builder = new VariantContextBuilder();
builder.source(name).id(dbsnp.getRsID());
// shift start/end left by one when a padding base was prepended
builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 1 : 0));
builder.alleles(alleles);
return builder.make();
}
// Replaces "-" (the old dbSNP null-allele marker) with the empty string in each allele.
private static List<String> stripNullDashes(final List<String> alleles) {
final List<String> newAlleles = new ArrayList<String>(alleles.size());
for ( final String allele : alleles ) {
if ( allele.equals("-") )
newAlleles.add("");
else
newAlleles.add(allele);
}
return newAlleles;
}
}
// --------------------------------------------------------------------------------------------------------------
//

View File

@ -51,10 +51,14 @@ public final class GATKVCFConstants {
public static final String SPANNING_DELETIONS_KEY = "Dels";
public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants
public static final String DOWNSAMPLED_KEY = "DS";
public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNT"; //M2
public static final String EVENT_DISTANCE_MAX_KEY = "MAX_ED"; //M2
public static final String EVENT_DISTANCE_MIN_KEY = "MIN_ED"; //M2
public static final String FISHER_STRAND_KEY = "FS";
public static final String GC_CONTENT_KEY = "GC";
public static final String GQ_MEAN_KEY = "GQ_MEAN";
public static final String GQ_STDEV_KEY = "GQ_STDDEV";
public static final String HAPLOTYPE_COUNT_KEY = "HCNT"; //M2
public static final String HAPLOTYPE_SCORE_KEY = "HaplotypeScore";
public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo";
public static final String HOMOPOLYMER_RUN_KEY = "HRun";
@ -80,8 +84,10 @@ public final class GATKVCFConstants {
public static final String ORIGINAL_CONTIG_KEY = "OriginalChr"; //LiftoverVariants
public static final String ORIGINAL_START_KEY = "OriginalStart"; //LiftoverVariants
public static final String N_BASE_COUNT_KEY = "PercentNBase";
public static final String NORMAL_LOD_KEY = "NLOD"; //M2
public static final String RBP_INCONSISTENT_KEY = "PhasingInconsistent"; //ReadBackedPhasing
public static final String GENOTYPE_PRIOR_KEY = "PG";
public static final String PANEL_OF_NORMALS_COUNT_KEY = "PON"; //M2
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE";
public static final String QUAL_BY_DEPTH_KEY = "QD";
public static final String BEAGLE_R2_KEY = "R2"; //BeagleOutputToVCF
@ -93,11 +99,18 @@ public final class GATKVCFConstants {
public static final String STRAND_ODDS_RATIO_KEY = "SOR";
public static final String STR_PRESENT_KEY = "STR";
public static final String TRANSMISSION_DISEQUILIBRIUM_KEY = "TDT";
public static final String TUMOR_LOD_KEY = "TLOD"; //M2
public static final String VARIANT_TYPE_KEY = "VariantType";
public static final String VQS_LOD_KEY = "VQSLOD";
public static final String OXOG_ALT_F1R2_KEY = "ALT_F1R2";
public static final String OXOG_ALT_F2R1_KEY = "ALT_F2R1";
public static final String OXOG_REF_F1R2_KEY = "REF_F1R2";
public static final String OXOG_REF_F2R1_KEY = "REF_F2R1";
public static final String OXOG_FRACTION_KEY = "FOXOG";
//FORMAT keys
public static final String ALLELE_BALANCE_KEY = "AB";
public static final String ALLELE_FRACTION_KEY = "AF"; //M2
public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
public static final String RBP_HAPLOTYPE_KEY = "HP"; //ReadBackedPhasing
public static final String AVG_INTERVAL_DP_BY_SAMPLE_KEY = "IDP"; //DiagnoseTargets
@ -110,6 +123,7 @@ public final class GATKVCFConstants {
public static final String HAPLOTYPE_CALLER_PHASING_GT_KEY = "PGT";
public static final String HAPLOTYPE_CALLER_PHASING_ID_KEY = "PID";
public static final String PHRED_SCALED_POSTERIORS_KEY = "PP"; //FamilyLikelihoodsUtils / PosteriorLikelihoodsUtils
public static final String QUALITY_SCORE_SUM_KEY = "QSS"; //M2
public static final String REFERENCE_GENOTYPE_QUALITY = "RGQ";
public static final String STRAND_COUNT_BY_SAMPLE_KEY = "SAC";
public static final String STRAND_BIAS_BY_SAMPLE_KEY = "SB";
@ -120,13 +134,21 @@ public final class GATKVCFConstants {
/* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic,
their names (or descriptions) depend on some threshold. Those filters are not included here
*/
public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public static final String ALT_ALLELE_IN_NORMAL_FILTER_NAME = "alt_allele_in_normal"; //M2
public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
public static final String CLUSTERED_EVENTS_FILTER_NAME = "clustered_events"; //M2
public static final String GERMLINE_RISK_FILTER_NAME = "germline_risk"; //M2
public static final String HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME = "homologous_mapping_event"; //M2
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public static final String MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME = "multi_event_alt_allele_in_normal"; //M2
public static final String PON_FILTER_NAME = "panel_of_normals"; //M2
public static final String STR_CONTRACTION_FILTER_NAME = "str_contraction"; //M2
public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2
// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL";
public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site
public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED = "*:DEL";
public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME_DEPRECATED+">", false); // represents any possible spanning deletion allele at this site
}

View File

@ -25,16 +25,11 @@
package org.broadinstitute.gatk.utils.variant;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.*;
import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
/**
* This class contains the VCFHeaderLine definitions for the annotation keys in GATKVCFConstants.
@ -66,6 +61,16 @@ public class GATKVCFHeaderLines {
addFilterLine(new VCFFilterHeaderLine(LOW_QUAL_FILTER_NAME, "Low quality"));
addFilterLine(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_NAME, "This site was set to monomorphic by Beagle"));
// M2-related filters
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Evidence seen in the normal sample"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME, "Clustered events observed in the tumor"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME, "Evidence indicates this site is germline, not somatic"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME, "More than three events were observed in the tumor"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Multiple events observed in tumor and normal"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.PON_FILTER_NAME, "Seen in at least 2 samples in the panel of normals"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold"));
addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short tandem repeat region"));
addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype"));
addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample"));
addFormatLine(new VCFFormatHeaderLine(MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample"));
@ -89,6 +94,15 @@ public class GATKVCFHeaderLines {
addFormatLine(new VCFFormatHeaderLine(JOINT_POSTERIOR_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint posterior probability of the genotype combination (after applying family priors)"));
addFormatLine(new VCFFormatHeaderLine(ORIGINAL_GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Original Genotype input to Beagle"));
// M2-related info lines
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.ALLELE_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Allele fraction of the event in the tumor"));
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_ALT_F1R2_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting the alternate allele"));
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_ALT_F2R1_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting the alternate allele"));
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_REF_F1R2_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting the reference allele"));
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_REF_F2R1_KEY, 1, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting the reference allele"));
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.OXOG_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Fraction of alt reads indicating OxoG error"));
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
@ -132,7 +146,7 @@ public class GATKVCFHeaderLines {
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_DP_KEY, 1, VCFHeaderLineType.Integer, "Original DP"));
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record"));
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record"));
addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds of being a true variant versus being false under the trained gaussian mixture model"));
addInfoLine(new VCFInfoHeaderLine(CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
addInfoLine(new VCFInfoHeaderLine(POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
addInfoLine(new VCFInfoHeaderLine(NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
@ -147,5 +161,15 @@ public class GATKVCFHeaderLines {
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AC_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site"));
// M2-related info lines
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, 1, VCFHeaderLineType.Integer, "Maximum distance between events in this active region"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, 1, VCFHeaderLineType.String, "Number of haplotypes that support this variant"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.NORMAL_LOD_KEY, 1, VCFHeaderLineType.String, "Normal LOD score"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY, 1, VCFHeaderLineType.String, "Count from Panel of Normals"));
addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_LOD_KEY, 1, VCFHeaderLineType.String, "Tumor LOD score"));
}
}

View File

@ -393,7 +393,7 @@ public class GATKVariantContextUtils {
if (repetitionCount[0] == 0 || repetitionCount[1] == 0)
return null;
if (lengths.size() == 0) {
if (lengths.isEmpty()) {
lengths.add(repetitionCount[0]); // add ref allele length only once
}
lengths.add(repetitionCount[1]); // add this alt allele's length
@ -947,7 +947,7 @@ public class GATKVariantContextUtils {
final String setKey,
final boolean filteredAreUncalled,
final boolean mergeInfoWithMaxAC ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
if ( unsortedVCs == null || unsortedVCs.isEmpty() )
return null;
if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size())
@ -965,7 +965,7 @@ public class GATKVariantContextUtils {
VCs.add(vc);
}
if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
if ( VCs.isEmpty() ) // everything is filtered out and we're filteredAreUncalled
return null;
// establish the baseline info from the first VC
@ -1289,7 +1289,7 @@ public class GATKVariantContextUtils {
* @param currentAlleles the list of alleles already created
* @return a non-null mapping of original alleles to new (extended) ones
*/
private static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
protected static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
final VariantContext oneVC,
final Collection<Allele> currentAlleles) {
final Allele myRef = oneVC.getReference();
@ -1305,6 +1305,8 @@ public class GATKVariantContextUtils {
if ( extended.equals(b) )
extended = b;
map.put(a, extended);
} else if ( a.isSymbolic() ) {
map.put(a, a);
}
}
@ -1696,7 +1698,7 @@ public class GATKVariantContextUtils {
// otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list
vcList.remove(k);
// avoid having empty lists
if (vcList.size() == 0)
if (vcList.isEmpty())
mappedVCs.remove(type);
if ( !mappedVCs.containsKey(vcType) )
mappedVCs.put(vcType, new ArrayList<VariantContext>());

View File

@ -46,6 +46,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
Allele ATref;
Allele Anoref;
Allele GT;
Allele Symbolic;
private GenomeLocParser genomeLocParser;
@ -63,6 +64,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
ATref = Allele.create("AT",true);
Anoref = Allele.create("A",false);
GT = Allele.create("GT",false);
Symbolic = Allele.create("<Symbolic>", false);
genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference)));
}
@ -1607,5 +1609,25 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
BaseUtils.fillWithRandomBases(bases, 1, bases.length);
return bases;
}
@Test
public void testCreateAlleleMapping(){
// Mapping alleles of a VC (ref A) onto a longer reference (ATref) should:
//   - leave symbolic alleles untouched (mapped to themselves), and
//   - extend concrete alt alleles with the extra reference bases (T -> TT).
final List<Allele> alleles = Arrays.asList(Aref,Symbolic,T);
final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make();
Map<Allele, Allele> map = GATKVariantContextUtils.createAlleleMapping(ATref, vc, alleles);
// Expected mapping for the alt alleles, in the same order as vc.getAlternateAlleles()
final List<Allele> expectedAlleles = Arrays.asList(Allele.create("<Symbolic>", false), Allele.create("TT", false));
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ){
Assert.assertEquals(map.get(vc.getAlternateAlleles().get(i)), expectedAlleles.get(i));
}
}
@Test(expectedExceptions = IllegalStateException.class)
public void testCreateAlleleMappingException(){
final List<Allele> alleles = Arrays.asList(Aref, Symbolic, T);
final VariantContext vc = new VariantContextBuilder().chr("chr1").alleles(alleles).make();
// createAlleleMapping should throw IllegalStateException when the new
// reference allele (Aref) is not strictly longer than the VC's own
// reference, since there are no extra bases with which to extend the alts.
Map<Allele, Allele> map = GATKVariantContextUtils.createAlleleMapping(Aref, vc, alleles);
}
}

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-aggregator</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../..</relativePath>
</parent>

View File

@ -9,7 +9,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>../gatk-root</relativePath>
</parent>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.gatk</groupId>
<artifactId>gatk-root</artifactId>
<version>3.4</version>
<version>3.5-SNAPSHOT</version>
<relativePath>gatk-root</relativePath>
</parent>