Merge pull request #275 from broadinstitute/md_fragment_with_pcr

Improvements to HaplotypeCaller and NA12878 KB
This commit is contained in:
Ryan Poplin 2013-06-14 09:32:26 -07:00
commit c4e508a71f
9 changed files with 394 additions and 94 deletions

View File

@ -106,7 +106,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles()));
builder.alleles(alleles);
builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL));
return builder.make();
} else {
return vc;
@ -352,6 +352,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
return allelesToUse.size() == 1
? GATKVariantContextUtils.subsetToRefOnly(vc, ploidy)
: GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse,
assignGenotypes ? GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN : GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
}

View File

@ -919,19 +919,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
final List<GATKSAMRecord> finalizedReadList = new ArrayList<>();
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
activeRegion.clearReads();
// Join overlapping paired reads to create a single longer read
finalizedReadList.addAll( fragmentCollection.getSingletonReads() );
for( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
}
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKSAMRecord> readsToUse = new ArrayList<>(finalizedReadList.size());
for( final GATKSAMRecord myRead : finalizedReadList ) {
final List<GATKSAMRecord> readsToUse = new ArrayList<>(activeRegion.getReads().size());
for( final GATKSAMRecord myRead : activeRegion.getReads() ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead;
@ -962,6 +953,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
}
activeRegion.clearReads();
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
}

View File

@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("294183823d678d3668f4fa98b4de6e06"));
Arrays.asList("facac578891a4f2be63ddd5ba6b9096b"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}

View File

@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("5e8f1fa88dc93320cc0e75e9fe6e153b"));
Arrays.asList("474dfb943a307c86cabe2043970c58f3"));
executeTest("test MultiSample Pilot1", spec);
}
@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("60115af273fde49c76d4df6c9c0f6501"));
Arrays.asList("3e646003c5b93da80c7d8e5d0ff2ee4e"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "e7b28ea087e8624f1e596c9d65381fea");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "03944bbedb012e2ac2026a84baa0560c");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -94,6 +94,6 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"2a72a9b5c6778b99bf155a7c5e90d11e");
"7e9f99d4cba8087dac66ea871b910d7e");
}
}

View File

@ -78,12 +78,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "f25b9cfc85995cbe8eb6ba5a126d713d");
HCTest(CEUTRIO_BAM, "", "09d84bc1aef2dd9c185934752172b794");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "19d685727ec60b3568f313bc44f79b49");
HCTest(NA12878_BAM, "", "5c074930b27d1f5c942fe755c2a8be27");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -94,7 +94,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"6da65f1d396b9c709eb6246cf3f615c1");
"005a6d1933913a5d96fc56d01303fa95");
}
@Test
@ -110,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e3db7d56154e36eeb887259bea4b241d");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "9b6f667ad87e19c38d16fefe63c37484");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("40416433baf96f4e84a058459717060b"));
Arrays.asList("a47ef09a8701128cfb301a83b7bb0728"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -194,7 +194,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("cf1461ce829023ea9920fbfeb534eb97"));
Arrays.asList("0cb99f6bb3e630add4b3486c496fa508"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("45ca324be3917655e645d6c290c9280f"));
Arrays.asList("92f947cc89e4f50cf2ef3121d2fe308d"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("b7037770b7953cdf858764b99fa243ed"));
Arrays.asList("91877c8ea3eb0e0316d9ad11fdcc1a87"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
}

View File

@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "bd2a57e6b0cffb4cbdba609a6c1683dc"});
tests.add(new Object[]{nct, "9da4cc89590c4c64a36f4a9c820f8609"});
}
return tests.toArray(new Object[][]{});

View File

@ -45,7 +45,7 @@ public class GATKVariantContextUtils {
public static final int DEFAULT_PLOIDY = 2;
public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
protected static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
public final static String MERGE_FILTER_PREFIX = "filterIn";
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
@ -421,6 +421,37 @@ public class GATKVariantContextUtils {
return true; // we passed all tests, we matched
}
public enum GenotypeAssignmentMethod {
/**
* Set all of the genotype GT values to NO_CALL (./.)
*/
SET_TO_NO_CALL,
/**
* Use the subsetted PLs to greedily assign genotypes (the genotype with the
* best likelihood wins)
*/
USE_PLS_TO_ASSIGN,
/**
* Try to match the original GT calls, if at all possible
*
* Suppose I have 3 alleles: A/B/C and the following samples:
*
* original_GT best_match to A/B best_match to A/C
* S1 => A/A A/A A/A
* S2 => A/B A/B A/A
* S3 => B/B B/B A/A
* S4 => B/C A/B A/C
* S5 => C/C A/A C/C
*
* Basically, all alleles not in the subset map to ref. It means that het-alt genotypes
* when split into 2 bi-allelic variants will be het in each, which is good in some cases,
* rather than the undetermined behavior when using the PLs to assign, which could result
* in hom-var or hom-ref for each, depending on the exact PL values.
*/
BEST_MATCH_TO_ORIGINAL
}
/**
* subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
*
@ -430,22 +461,23 @@ public class GATKVariantContextUtils {
* @return genotypes
*/
public static GenotypesContext subsetDiploidAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes) {
final List<Allele> allelesToUse,
final GenotypeAssignmentMethod assignGenotypes) {
if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele");
if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele");
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
// optimization: if no input genotypes, just exit
if (oldGTs.isEmpty())
return newGTs;
if (oldGTs.isEmpty()) return newGTs;
// samples
final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2);
@ -456,8 +488,8 @@ public class GATKVariantContextUtils {
// an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles,
// then we can keep the PLs as is; otherwise, we determine which ones to keep
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
likelihoodIndexesToUse = new ArrayList<Integer>(30);
if ( numNewAltAlleles != numOriginalAltAlleles ) {
likelihoodIndexesToUse = new ArrayList<>(30);
final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
@ -478,55 +510,127 @@ public class GATKVariantContextUtils {
// create the new genotypes
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
if ( !g.hasLikelihoods() ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
continue;
}
final GenotypeBuilder gb = new GenotypeBuilder(g);
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods;
if ( likelihoodIndexesToUse == null ) {
newLikelihoods = originalLikelihoods;
} else if ( originalLikelihoods.length != expectedNumLikelihoods ) {
logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods);
if ( !g.hasLikelihoods() ) {
// we don't have any likelihoods, so we null out PLs and make G ./.
newLikelihoods = null;
gb.noPL();
} else {
newLikelihoods = new double[likelihoodIndexesToUse.size()];
int newIndex = 0;
for ( int oldIndex : likelihoodIndexesToUse )
newLikelihoods[newIndex++] = originalLikelihoods[oldIndex];
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
if ( likelihoodIndexesToUse == null ) {
newLikelihoods = originalLikelihoods;
} else if ( originalLikelihoods.length != expectedNumLikelihoods ) {
logger.warn("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods);
newLikelihoods = null;
} else {
newLikelihoods = new double[likelihoodIndexesToUse.size()];
int newIndex = 0;
for ( int oldIndex : likelihoodIndexesToUse )
newLikelihoods[newIndex++] = originalLikelihoods[oldIndex];
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( newLikelihoods != null && MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
}
else {
final GenotypeBuilder gb = new GenotypeBuilder(g);
if ( newLikelihoods == null || numNewAltAlleles == 0 )
if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) )
gb.noPL();
else
gb.PL(newLikelihoods);
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
gb.alleles(NO_CALL_ALLELES);
}
else {
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2)));
if ( numNewAltAlleles != 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
}
newGTs.add(gb.make());
}
updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse);
newGTs.add(gb.make());
}
return newGTs;
}
/**
* Are the log10 genotype likelihoods effectively uninformative?
*
* If the sum of the log10 GLs is above SUM_GL_THRESH_NOCALL (a small negative number),
* the GLs are all ~0 (flat) and carry no real signal, so the caller should force a no-call.
*
* @param likelihoods vector of log10 genotype likelihoods
* @return true if the likelihoods should be treated as non-informative
*/
private static boolean likelihoodsAreUninformative(final double[] likelihoods) {
return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL;
}
/**
* Add the genotype call (GT) field to GenotypeBuilder using the requested assignmentMethod
*
* Side effect: always clears the AD field on gb, since per-allele depths are no
* longer valid after the allele set has been subsetted.
*
* @param originalGT the original genotype calls, cannot be null
* @param gb the builder where we should put our newly called alleles, cannot be null
* @param assignmentMethod the method to use to do the assignment, cannot be null
* @param newLikelihoods a vector of log10 likelihoods to use if the method requires PLs, cannot be null
* @param allelesToUse the alleles we are using for our subsetting
*/
protected static void updateGenotypeAfterSubsetting(final List<Allele> originalGT,
final GenotypeBuilder gb,
final GenotypeAssignmentMethod assignmentMethod,
final double[] newLikelihoods,
final List<Allele> allelesToUse) {
// AD is meaningless once the allele set changes
gb.noAD();

if ( assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL ) {
gb.alleles(NO_CALL_ALLELES);
gb.noGQ();
} else if ( assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN ) {
if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) {
// no informative mass on the (new) likelihoods, so just no-call the sample
gb.alleles(NO_CALL_ALLELES);
gb.noGQ();
} else {
// greedily take the genotype whose likelihood is maximal
final int bestPLIndex = MathUtils.maxElementIndex(newLikelihoods);
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(bestPLIndex);
gb.alleles(Arrays.asList(allelesToUse.get(pair.alleleIndex1), allelesToUse.get(pair.alleleIndex2)));
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(bestPLIndex, newLikelihoods));
}
} else if ( assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL ) {
// keep each original allele if it survived the subset, otherwise map it to ref
final Allele refAllele = allelesToUse.get(0); // WARNING -- should be checked in input argument
final List<Allele> bestMatch = new LinkedList<>();
for ( final Allele allele : originalGT ) {
bestMatch.add(allelesToUse.contains(allele) ? allele : refAllele);
}
gb.noGQ();
gb.noPL();
gb.alleles(bestMatch);
}
}
/**
* Subset the samples in VC to reference only information with ref call alleles
*
* Preserves DP and GQ if present; drops AD and PLs (they are not carried over
* to the new genotypes).
*
* @param vc the variant context whose genotypes we will convert to ref-only calls, cannot be null
* @param ploidy ploidy to use if a genotype doesn't have any alleles, must be >= 1
* @return a GenotypesContext of hom-ref calls, one per sample in vc
*/
public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) {
if ( vc == null ) throw new IllegalArgumentException("vc cannot be null");
if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy);
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
// optimization: if no input genotypes, just exit (returns the original, empty context)
if (oldGTs.isEmpty()) return oldGTs;
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
final Allele ref = vc.getReference();
// cache the common diploid case so we don't rebuild the allele list per sample
final List<Allele> diploidRefAlleles = Arrays.asList(ref, ref);
// create the new genotypes: one hom-ref call per sample, falling back to the
// requested ploidy only when the genotype itself carries no alleles
for ( final Genotype g : vc.getGenotypes() ) {
final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy();
final List<Allele> refAlleles = gPloidy == 2 ? diploidRefAlleles : Collections.nCopies(gPloidy, ref);
final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles);
if ( g.hasDP() ) gb.DP(g.getDP());
if ( g.hasGQ() ) gb.GQ(g.getGQ());
newGTs.add(gb.make());
}
return newGTs;
@ -539,7 +643,7 @@ public class GATKVariantContextUtils {
* @return genotypes context
*/
public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) {
return subsetDiploidAlleles(vc, vc.getAlleles(), true);
return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN);
}
/**
@ -557,7 +661,7 @@ public class GATKVariantContextUtils {
* @return a list of bi-allelic (or monomorphic) variant context
*/
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc) {
return splitVariantContextToBiallelics(vc, false);
return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL);
}
/**
@ -575,18 +679,18 @@ public class GATKVariantContextUtils {
* @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome
* @return a list of bi-allelic (or monomorphic) variant context
*/
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft) {
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) {
if ( ! vc.isVariant() || vc.isBiallelic() )
// non variant or biallelics already satisfy the contract
return Collections.singletonList(vc);
else {
final List<VariantContext> biallelics = new LinkedList<VariantContext>();
final List<VariantContext> biallelics = new LinkedList<>();
for ( final Allele alt : vc.getAlternateAlleles() ) {
VariantContextBuilder builder = new VariantContextBuilder(vc);
final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
builder.alleles(alleles);
builder.genotypes(subsetDiploidAlleles(vc, alleles, false));
builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod));
VariantContextUtils.calculateChromosomeCounts(builder, true);
final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true);
biallelics.add(trimmed);

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.variant;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.variantcontext.*;
@ -39,6 +40,7 @@ import org.testng.annotations.Test;
import java.util.*;
public class GATKVariantContextUtilsUnitTest extends BaseTest {
private final static boolean DEBUG = false;
Allele Aref, T, C, G, Cref, ATC, ATCATC;
@ -168,7 +170,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return MergeAllelesTest.getTests(MergeAllelesTest.class);
}
@Test(dataProvider = "mergeAlleles")
@Test(enabled = !DEBUG, dataProvider = "mergeAlleles")
public void testMergeAlleles(MergeAllelesTest cfg) {
final List<VariantContext> inputs = new ArrayList<VariantContext>();
@ -229,7 +231,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class);
}
@Test(dataProvider = "simplemergersiddata")
@Test(enabled = !DEBUG, dataProvider = "simplemergersiddata")
public void testRSIDMerge(SimpleMergeRSIDTest cfg) {
VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T));
final List<VariantContext> inputs = new ArrayList<VariantContext>();
@ -352,7 +354,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return MergeFilteredTest.getTests(MergeFilteredTest.class);
}
@Test(dataProvider = "mergeFiltered")
@Test(enabled = !DEBUG, dataProvider = "mergeFiltered")
public void testMergeFiltered(MergeFilteredTest cfg) {
final List<String> priority = vcs2priority(cfg.inputs);
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
@ -479,7 +481,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return MergeGenotypesTest.getTests(MergeGenotypesTest.class);
}
@Test(dataProvider = "mergeGenotypes")
@Test(enabled = !DEBUG, dataProvider = "mergeGenotypes")
public void testMergeGenotypes(MergeGenotypesTest cfg) {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
@ -517,7 +519,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
}
}
@Test
@Test(enabled = !DEBUG)
public void testMergeGenotypesUniquify() {
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
@ -547,7 +549,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
//
// --------------------------------------------------------------------------------
@Test
@Test(enabled = !DEBUG)
public void testAnnotationSet() {
for ( final boolean annotate : Arrays.asList(true, false)) {
for ( final String set : Arrays.asList("set", "combine", "x")) {
@ -618,7 +620,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class);
}
@Test(dataProvider = "ReverseClippingPositionTestProvider")
@Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider")
public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) {
int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes());
Assert.assertEquals(result, cfg.expectedClip);
@ -706,7 +708,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "SplitBiallelics")
@Test(enabled = !DEBUG, dataProvider = "SplitBiallelics")
public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
final List<VariantContext> biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
Assert.assertEquals(biallelics.size(), expectedBiallelics.size());
@ -717,7 +719,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
}
}
@Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes")
@Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes")
public void testSplitBiallelicsGenotypes(final VariantContext vc, final List<VariantContext> expectedBiallelics) {
final List<Genotype> genotypes = new ArrayList<Genotype>();
@ -745,7 +747,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
}
}
// --------------------------------------------------------------------------------
//
// Test repeats
@ -810,14 +811,14 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return RepeatDetectorTest.getTests(RepeatDetectorTest.class);
}
@Test(dataProvider = "RepeatDetectorTest")
@Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest")
public void testRepeatDetectorTest(RepeatDetectorTest cfg) {
// test alleles are equal
Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat);
}
@Test
@Test(enabled = !DEBUG)
public void testRepeatAllele() {
Allele nullR = Allele.create("A", true);
Allele nullA = Allele.create("A", false);
@ -940,7 +941,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "ForwardClippingData")
@Test(enabled = !DEBUG, dataProvider = "ForwardClippingData")
public void testForwardClipping(final List<String> alleleStrings, final int expectedClip) {
final List<Allele> alleles = new LinkedList<Allele>();
for ( final String alleleString : alleleStrings )
@ -975,7 +976,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "ClipAlleleTest")
@Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest")
public void testClipAlleles(final List<String> alleleStrings, final List<String> expected, final int numLeftClipped) {
final int start = 10;
final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings);
@ -1019,7 +1020,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "PrimitiveAlleleSplittingData")
@Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData")
public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List<Integer> variantPositions) {
final int start = 10;
@ -1066,7 +1067,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "AlleleRemappingData")
@Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData")
public void testAlleleRemapping(final Map<Allele, Allele> alleleMap, final int numGenotypes) {
final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap);
@ -1102,4 +1103,204 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
return gc;
}
// --------------------------------------------------------------------------------
//
// Test subsetDiploidAlleles
//
// --------------------------------------------------------------------------------
@DataProvider(name = "subsetDiploidAllelesData")
public Object[][] makesubsetDiploidAllelesData() {
List<Object[]> tests = new ArrayList<>();
final Allele A = Allele.create("A", true);
final Allele C = Allele.create("C");
final Allele G = Allele.create("G");
final List<Allele> AA = Arrays.asList(A,A);
final List<Allele> AC = Arrays.asList(A,C);
final List<Allele> CC = Arrays.asList(C,C);
final List<Allele> AG = Arrays.asList(A,G);
final List<Allele> CG = Arrays.asList(C,G);
final List<Allele> GG = Arrays.asList(G,G);
final List<Allele> ACG = Arrays.asList(A,C,G);
final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make();
final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01});
final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01});
final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9});
final double[] uninformative = new double[]{0, 0, 0};
final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make();
// make sure we don't screw up the simple case
final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make();
final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make();
final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make();
tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).noAD().make())});
tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).noAD().make())});
tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).noAD().make())});
// uninformative test case
final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make();
final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make();
tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)});
// actually subsetting down from multiple alt values
final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50};
final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50};
final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50};
final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50};
final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG
final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG
tests.add(new Object[]{
new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homRef3AllelesPL).make()).make(),
AC,
Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).noAD().GQ(100).make())});
tests.add(new Object[]{
new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefC3AllelesPL).make()).make(),
AC,
Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).noAD().GQ(100).make())});
tests.add(new Object[]{
new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homC3AllelesPL).make()).make(),
AC,
Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).noAD().GQ(100).make())});
tests.add(new Object[]{
new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetRefG3AllelesPL).make()).make(),
AG,
Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).noAD().GQ(200).make())});
// wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done
tests.add(new Object[]{
new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(hetCG3AllelesPL).make()).make(),
AG,
Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).noAD().GQ(200).make())});
tests.add(new Object[]{
new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).noAD().PL(homG3AllelesPL).make()).make(),
AG,
Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).noAD().GQ(200).make())});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "subsetDiploidAllelesData")
public void testsubsetDiploidAllelesData(final VariantContext inputVC,
                                         final List<Allele> allelesToUse,
                                         final List<Genotype> expectedGenotypes) {
    // Subset the input genotypes down to allelesToUse, re-assigning genotypes from PLs,
    // then verify each expected genotype is present (looked up by sample name) and equal.
    final GenotypesContext subset = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN);
    Assert.assertEquals(subset.size(), expectedGenotypes.size());
    for ( int i = 0; i < expectedGenotypes.size(); i++ ) {
        final Genotype expectedGT = expectedGenotypes.get(i);
        final Genotype observedGT = subset.get(expectedGT.getSampleName());
        Assert.assertNotNull(observedGT);
        assertGenotypesAreEqual(observedGT, expectedGT);
    }
}
@DataProvider(name = "UpdateGenotypeAfterSubsettingData")
public Object[][] makeUpdateGenotypeAfterSubsettingData() {
    // Each row: assignment mode, genotype likelihoods, original GT alleles,
    // alleles being subset to, and the alleles we expect in the resulting genotype.
    final List<Object[]> cases = new ArrayList<>();

    final Allele A = Allele.create("A", true);
    final Allele C = Allele.create("C");
    final Allele G = Allele.create("G");
    final List<Allele> AA  = Arrays.asList(A, A);
    final List<Allele> AC  = Arrays.asList(A, C);
    final List<Allele> CC  = Arrays.asList(C, C);
    final List<Allele> AG  = Arrays.asList(A, G);
    final List<Allele> CG  = Arrays.asList(C, G);
    final List<Allele> GG  = Arrays.asList(G, G);
    final List<Allele> ACG = Arrays.asList(A, C, G);
    final List<List<Allele>> subsetTargets = Arrays.asList(AC, AG, ACG);

    // Short local names for the three assignment modes exercised below
    final GATKVariantContextUtils.GenotypeAssignmentMethod toNoCall  = GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL;
    final GATKVariantContextUtils.GenotypeAssignmentMethod byPLs     = GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN;
    final GATKVariantContextUtils.GenotypeAssignmentMethod bestMatch = GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL;

    final double[] homRefPL      = new double[]{0.9, 0.09, 0.01};
    final double[] hetPL         = new double[]{0.09, 0.9, 0.01};
    final double[] homVarPL      = new double[]{0.01, 0.09, 0.9};
    final double[] uninformative = new double[]{0.33, 0.33, 0.33};
    final List<double[]> allLikelihoods = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative);

    // SET_TO_NO_CALL always produces a no-call, whatever the likelihoods or target alleles
    for ( final List<Allele> target : subsetTargets ) {
        for ( final double[] likelihoods : allLikelihoods ) {
            cases.add(new Object[]{toNoCall, likelihoods, AA, target, GATKVariantContextUtils.NO_CALL_ALLELES});
        }
    }

    // USE_PLS_TO_ASSIGN picks the genotype implied by the likelihoods, regardless of the prior GT
    for ( final List<Allele> priorGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) {
        cases.add(new Object[]{byPLs, homRefPL, priorGT, AC, AA});
        cases.add(new Object[]{byPLs, hetPL,    priorGT, AC, AC});
        cases.add(new Object[]{byPLs, homVarPL, priorGT, AC, CC});
        // disabled: uninformative PLs do not currently produce a no-call here
        // cases.add(new Object[]{byPLs, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES});
    }

    // BEST_MATCH_TO_ORIGINAL keeps the closest genotype expressible with the target alleles,
    // independent of the likelihoods
    for ( final double[] likelihoods : allLikelihoods ) {
        cases.add(new Object[]{bestMatch, likelihoods, AA, AC,  AA});
        cases.add(new Object[]{bestMatch, likelihoods, AC, AC,  AC});
        cases.add(new Object[]{bestMatch, likelihoods, CC, AC,  CC});
        cases.add(new Object[]{bestMatch, likelihoods, CG, AC,  AC});
        cases.add(new Object[]{bestMatch, likelihoods, AA, AG,  AA});
        cases.add(new Object[]{bestMatch, likelihoods, AC, AG,  AA});
        cases.add(new Object[]{bestMatch, likelihoods, CC, AG,  AA});
        cases.add(new Object[]{bestMatch, likelihoods, CG, AG,  AG});
        cases.add(new Object[]{bestMatch, likelihoods, AA, ACG, AA});
        cases.add(new Object[]{bestMatch, likelihoods, AC, ACG, AC});
        cases.add(new Object[]{bestMatch, likelihoods, CC, ACG, CC});
        cases.add(new Object[]{bestMatch, likelihoods, AG, ACG, AG});
        cases.add(new Object[]{bestMatch, likelihoods, CG, ACG, CG});
        cases.add(new Object[]{bestMatch, likelihoods, GG, ACG, GG});
    }

    return cases.toArray(new Object[][]{});
}
@Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData")
public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode,
                                              final double[] likelihoods,
                                              final List<Allele> originalGT,
                                              final List<Allele> allelesToUse,
                                              final List<Allele> expectedAlleles) {
    // Normalize the raw likelihoods into log10 space, run the subsetting update on a
    // fresh builder, and check the resulting alleles as a set (order-insensitive).
    final double[] log10Likelihoods = MathUtils.normalizeFromLog10(likelihoods, true, false);
    final GenotypeBuilder builder = new GenotypeBuilder("test");
    GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, builder, mode, log10Likelihoods, allelesToUse);
    final Genotype result = builder.make();
    Assert.assertEquals(new HashSet<>(result.getAlleles()), new HashSet<>(expectedAlleles));
}
@Test(enabled = !DEBUG)
public void testSubsetToRef() {
    // Map each input genotype to the genotype subsetToRefOnly is expected to produce:
    // same ploidy but all-ref alleles, with AD and PL dropped (DP/GQ retained).
    final Map<Genotype, Genotype> expectedByInput = new LinkedHashMap<>();
    for ( final List<Allele> alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) {
        for ( final String name : Arrays.asList("test1", "test2") ) {
            final GenotypeBuilder builder = new GenotypeBuilder(name, alleles);
            builder.DP(10);
            builder.GQ(30);
            // AD and PL are both the ramp 1..ploidy ({1}, {1,2} or {1,2,3})
            final int[] ramp = new int[alleles.size()];
            for ( int i = 0; i < ramp.length; i++ ) {
                ramp[i] = i + 1;
            }
            builder.AD(ramp.clone());
            builder.PL(ramp.clone());
            final Genotype input = builder.make();
            final List<Allele> allRef = Collections.nCopies(alleles.size(), Aref);
            expectedByInput.put(input, builder.alleles(allRef).noAD().noPL().make());
        }
    }
    // Exercise subsetToRefOnly over every 1-, 2- and 3-genotype combination of the inputs
    for ( final int n : Arrays.asList(1, 2, 3) ) {
        for ( final List<Genotype> genotypes : Utils.makePermutations(new ArrayList<>(expectedByInput.keySet()), n, false) ) {
            final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make();
            final GenotypesContext refOnly = GATKVariantContextUtils.subsetToRefOnly(vc, 2);
            Assert.assertEquals(refOnly.size(), genotypes.size());
            for ( int i = 0; i < genotypes.size(); i++ ) {
                assertGenotypesAreEqual(refOnly.get(i), expectedByInput.get(genotypes.get(i)));
            }
        }
    }
}
}