CombineVariants outputs the first non-MISSING qual, not the maximum

-- When merging multiple VCF records at a site, the combined VCF record has the QUAL of the first VCF record with a non-MISSING QUAL value.  The previous behavior was to take the max QUAL, which resulted in sometime strange downstream confusion.
This commit is contained in:
Mark DePristo 2012-08-19 10:29:38 -04:00
parent 342a5b68ed
commit 9121b98167
3 changed files with 14 additions and 5 deletions

View File

@ -73,6 +73,13 @@ import java.util.*;
* efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing
* returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
*
* Some fine details about the merging algorithm:
* <ul>
* <li> As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of
* the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the
* max QUAL, which resulted in sometime strange downstream confusion</li>
* </ul>
*
* <h2>Input</h2>
* <p>
* One or more variant sets to combine.

View File

@ -514,7 +514,7 @@ public class VariantContextUtils {
int depth = 0;
int maxAC = -1;
final Map<String, Object> attributesWithMaxAC = new LinkedHashMap<String, Object>();
double log10PError = 1;
double log10PError = CommonInfo.NO_LOG10_PERROR;
VariantContext vcWithMaxAC = null;
GenotypesContext genotypes = GenotypesContext.create();
@ -542,7 +542,9 @@ public class VariantContextUtils {
mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);
log10PError = Math.min(log10PError, vc.isVariant() ? vc.getLog10PError() : 1);
// We always take the QUAL of the first VC with a non-MISSING qual for the combined value
if ( log10PError == CommonInfo.NO_LOG10_PERROR )
log10PError = vc.getLog10PError();
filters.addAll(vc.getFilters());

View File

@ -113,13 +113,13 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "4efdf983918db822e4ac13d911509576"); } // official project VCF files in tabix format
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "848d4408ee953053d2307cefebc6bd6d"); } // official project VCF files in tabix format
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "5d04f22ef88ed9226cbd7b4483c5cb23"); }
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "629656bfef7713c23f3a593523503b2f"); }
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e54d0dcf14f90d5c8e58b45191dd0219"); }
@Test public void uniqueSNPs() {
// parallelism must be disabled because the input VCF is malformed (DB=0) and parallelism actually fixes this which breaks the md5s
combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "acc70f33be741b564f7be9aa3f819dd4", true);
combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "e5ea6ac3905bd9eeea1a2ef5d2cb5af7", true);
}
@Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "def52bcd3942bbe39cd7ebe845c4f206"); }
@ -137,7 +137,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
" -genotypeMergeOptions UNIQUIFY -L 1"),
1,
Arrays.asList("3039cfff7abee6aa7fbbafec66a1b019"));
Arrays.asList("e5f0e7a80cd392172ebf5ddb06b91a00"));
cvExecuteTest("threeWayWithRefs", spec, true);
}