Fixing GenotypesGVCF.

Bug uncovered by some untrimmed alleles in the single sample pipeline output.

Notice however does not fix the untrimmed alleles in general.

Story:

https://www.pivotaltracker.com/story/show/65481104

Changes:

1. Fixed the bug itself.
2. Fixed non-working tests (sliently skipped due to exception in dataProvider).
This commit is contained in:
Valentin Ruano-Rubio 2014-02-14 16:34:13 -05:00
parent 6963bf6c91
commit c167fb5fdf
3 changed files with 322 additions and 294 deletions

View File

@ -65,7 +65,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("ebda39d3343b34d21490a78284ed88e8"));
Arrays.asList("9c618890c03ee9cae1d269039fc29506"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -89,7 +89,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("8eeda24a07f66d67b7639a31fda5c903"));
Arrays.asList("27f3e4700cf836c23a9af2dc1d1bbecb"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}

View File

@ -774,6 +774,9 @@ public class GATKVariantContextUtils {
return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
}
//TODO consider refactor variant-context merging code so that we share as much as possible between
//TODO simpleMerge and referenceConfidenceMerge
//TODO likely using a separate helper class or hierarchy.
/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
@ -1051,31 +1054,45 @@ public class GATKVariantContextUtils {
final String name = first.getSource();
// ref allele
final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase);
final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase);
if ( refAllele == null )
return null;
// alt alleles
final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc);
// the allele list will not include the <NON_REF> symbolic allele, so add it if needed
if ( !removeNonRefSymbolicAllele )
alleleMapper.map.put(NON_REF_SYMBOLIC_ALLELE, NON_REF_SYMBOLIC_ALLELE);
final List<Allele> alleles = getAllelesListFromMapper(refAllele, alleleMapper);
// FinalAlleleSet contains the alleles of the new resulting VC.
// Using linked set in order to guaranteed an stable order:
final LinkedHashSet<Allele> finalAlleleSet = new LinkedHashSet<>(10);
// Reference goes first:
finalAlleleSet.add(refAllele);
final Map<String, Object> attributes = new LinkedHashMap<>();
final Set<String> rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
int depth = 0;
final Map<String, List<Comparable>> annotationMap = new LinkedHashMap<>();
GenotypesContext genotypes = GenotypesContext.create();
final GenotypesContext genotypes = GenotypesContext.create();
final int variantContextCount = VCs.size();
// In this list we hold the mapping of each variant context alleles.
final List<Pair<VariantContext,List<Allele>>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount);
// cycle through and add info from the other VCs
for ( final VariantContext vc : VCs ) {
// if this context doesn't start at the current location then it must be a spanning event (deletion or ref block)
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
final List<Allele> remappedAlleles = isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles());
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles);
vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles())
: remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet)));
}
// Add <NON_REF> to the end if at all required in in the output.
if (!removeNonRefSymbolicAllele) finalAlleleSet.add(NON_REF_SYMBOLIC_ALLELE);
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
for ( final Pair<VariantContext,List<Allele>> pair : vcAndNewAllelePairs ) {
final VariantContext vc = pair.getFirst();
final List<Allele> remappedAlleles = pair.getSecond();
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) )
@ -1083,7 +1100,7 @@ public class GATKVariantContextUtils {
else if ( vc.getNSamples() == 1 && vc.getGenotype(0).hasExtendedAttribute("MIN_DP") ) // handle the gVCF case from the HaplotypeCaller
depth += vc.getGenotype(0).getAttributeAsInt("MIN_DP", 0);
if ( isSpanningEvent )
if ( loc.getStart() != vc.getStart() )
continue;
// special case ID (just preserve it)
@ -1108,8 +1125,8 @@ public class GATKVariantContextUtils {
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles)
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(alleles, loc.getStart(), loc.getStart())
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList)
.chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later
return builder.make();
@ -1123,27 +1140,13 @@ public class GATKVariantContextUtils {
* @param refBase the reference allele to use if all contexts in the VC are spanning
* @return new Allele or null if no reference allele/base is available
*/
private static Allele determineReferenceAlleleGiveReferenceBase(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase) {
private static Allele determineReferenceAlleleGivenReferenceBase(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase) {
final Allele refAllele = determineReferenceAllele(VCs, loc);
if ( refAllele == null )
return ( refBase == null ? null : Allele.create(refBase, true) );
return refAllele;
}
/**
* Creates an alleles list given a reference allele and a mapper
*
* @param refAllele the reference allele
* @param alleleMapper the allele mapper
* @return a non-null, non-empty list of Alleles
*/
private static List<Allele> getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) {
final List<Allele> alleles = new ArrayList<>();
alleles.add(refAllele);
alleles.addAll(alleleMapper.getUniqueMappedAlleles());
return alleles;
}
/**
* Remove the stale attributes from the merged set
*
@ -1157,7 +1160,6 @@ public class GATKVariantContextUtils {
attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
attributes.remove(VCFConstants.END_KEY);
}
/**
* Adds attributes to the global map from the new context in a sophisticated manner
*
@ -1207,6 +1209,54 @@ public class GATKVariantContextUtils {
return it1.hasNext() || it2.hasNext();
}
//TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping.
/**
* This method does a couple of things:
* <ul><li>
* remaps the vc alleles considering the differences between the final reference allele and its own reference,</li>
* <li>
* collects alternative alleles present in variant context and add them to the {@code finalAlleles} set.
* </li></ul>
*
* @param vcAlleles the variant context allele list.
* @param refAllele final reference allele.
* @param finalAlleles where to add the final set of non-ref called alleles.
* @return never {@code null}
*/
private static List<Allele> remapAlleles(final List<Allele> vcAlleles, final Allele refAllele, final LinkedHashSet<Allele> finalAlleles) {
final Allele vcRef = vcAlleles.get(0);
if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference");
final byte[] refBases = refAllele.getBases();
final int extraBaseCount = refBases.length - vcRef.getBases().length;
if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected");
final List<Allele> result = new ArrayList<>(vcAlleles.size());
for (final Allele a : vcAlleles) {
if (a.isReference()) {
result.add(refAllele);
} else if (a.isSymbolic()) {
result.add(a);
// we always skip <NON_REF> when adding to finalAlleles this is done outside if applies.
if (!a.equals(NON_REF_SYMBOLIC_ALLELE))
finalAlleles.add(a);
} else if (a.isCalled()) {
final Allele newAllele;
if (extraBaseCount > 0) {
final byte[] oldBases = a.getBases();
final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount);
System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount);
newAllele = Allele.create(newBases,false);
} else
newAllele = a;
result.add(newAllele);
finalAlleles.add(newAllele);
} else { // NO_CALL and strange miscellanea
result.add(a);
}
}
return result;
}
public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) {
final GenotypesContext newGs = GenotypesContext.create(genotypes.size());
@ -1346,50 +1396,10 @@ public class GATKVariantContextUtils {
return ref;
}
static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) {
public static boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) {
return loc == null || loc.getStart() == vc.getStart();
}
/**
* Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts.
*
* @param VCs the list of VariantContexts
* @param refAllele the reference allele
* @param loc if not null, ignore records that do not begin at this start location
* @return non-null AlleleMapper
*/
static private AlleleMapper determineAlternateAlleleMapping(final List<VariantContext> VCs, final Allele refAllele, final GenomeLoc loc) {
final Map<Allele, Allele> map = new HashMap<>();
for ( final VariantContext vc : VCs ) {
if ( contextMatchesLoc(vc, loc) )
addAllAlternateAllelesToMap(vc, refAllele, map);
}
return new AlleleMapper(map);
}
/**
* Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper)
*
* @param vc the VariantContext
* @param refAllele the reference allele
* @param map the allele mapping to populate
*/
static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map<Allele, Allele> map) {
// if the ref allele matches, then just add the alts as is
if ( refAllele.equals(vc.getReference()) ) {
for ( final Allele altAllele : vc.getAlternateAlleles() ) {
// ignore symbolic alleles
if ( ! altAllele.isSymbolic() )
map.put(altAllele, altAllele);
}
}
else {
map.putAll(createAlleleMapping(refAllele, vc, map.values()));
}
}
static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set<Allele> allAlleles) {
if ( refAllele.equals(vc.getReference()) )
return new AlleleMapper(vc);
@ -1977,7 +1987,6 @@ public class GATKVariantContextUtils {
return new VariantContextBuilder(vc).genotypes(newGenotypes).make();
}
protected static class AlleleMapper {
private VariantContext vc = null;
private Map<Allele, Allele> map = null;

View File

@ -42,6 +42,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
private final static boolean DEBUG = false;
Allele Aref, T, C, G, Cref, ATC, ATCATC;
Allele ATCATCT;
Allele ATref;
Allele Anoref;
Allele GT;
@BeforeSuite
public void setup() {
@ -53,6 +57,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
G = Allele.create("G");
ATC = Allele.create("ATC");
ATCATC = Allele.create("ATCATC");
ATCATCT = Allele.create("ATCATCT");
ATref = Allele.create("AT",true);
Anoref = Allele.create("A",false);
GT = Allele.create("GT",false);
}
private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) {
@ -86,7 +94,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
private VariantContext makeVC(String source, List<Allele> alleles, Collection<Genotype> genotypes, Set<String> filters) {
int start = 10;
int stop = start; // alleles.contains(ATC) ? start + 3 : start;
int stop = start + alleles.get(0).length() - 1; // alleles.contains(ATC) ? start + 3 : start;
return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make();
}
@ -138,6 +146,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
Arrays.asList(Aref, C, T));
new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T));
new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C));
new MergeAllelesTest(Arrays.asList(Aref, T, C),
@ -162,6 +171,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
Arrays.asList(Aref, ATCATC),
Arrays.asList(Aref, ATC, ATCATC));
new MergeAllelesTest(Arrays.asList(ATref, ATC, Anoref, G),
Arrays.asList(Aref, ATCATC, G),
Arrays.asList(ATref, ATC, Anoref, G, ATCATCT, GT));
return MergeAllelesTest.getTests(MergeAllelesTest.class);
}
@ -182,6 +195,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false);
Assert.assertEquals(merged.getAlleles().size(),cfg.expected.size());
Assert.assertEquals(merged.getAlleles(), cfg.expected);
}
@ -1413,224 +1427,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
//
// --------------------------------------------------------------------------------
@DataProvider(name = "generatePLsData")
public Object[][] makeGeneratePLsData() {
final List<Object[]> tests = new ArrayList<>();
for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) {
for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) {
for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) {
final int[] indexes = new int[originalAlleles];
for ( int i = 0; i < originalAlleles; i++ )
indexes[i] = i;
indexes[swapPosition1] = swapPosition2;
indexes[swapPosition2] = swapPosition1;
tests.add(new Object[]{originalAlleles, indexes});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "generatePLsData")
public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) {
final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2);
final int[] PLs = new int[numLikelihoods];
for ( int i = 0; i < numLikelihoods; i++ )
PLs[i] = i;
final List<Allele> alleles = new ArrayList<>(numOriginalAlleles);
alleles.add(Allele.create("A", true));
for ( int i = 1; i < numOriginalAlleles; i++ )
alleles.add(Allele.create(Utils.dupString('A', i + 1), false));
final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make();
final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering);
Assert.assertEquals(newPLs.length, numLikelihoods);
final int[] expectedPLs = new int[numLikelihoods];
for ( int i = 0; i < numOriginalAlleles; i++ ) {
for ( int j = i; j < numOriginalAlleles; j++ ) {
final int index = GenotypeLikelihoods.calculatePLindex(i, j);
final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]);
expectedPLs[index] = value;
}
}
for ( int i = 0; i < numLikelihoods; i++ ) {
Assert.assertEquals(newPLs[i], expectedPLs[i]);
}
}
@Test(expectedExceptions = UserException.class)
public void testGetIndexesOfRelevantAllelesWithNoALT() {
final List<Allele> alleles1 = new ArrayList<>(1);
alleles1.add(Allele.create("A", true));
final List<Allele> alleles2 = new ArrayList<>(1);
alleles2.add(Allele.create("A", true));
GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
}
@DataProvider(name = "getIndexesOfRelevantAllelesData")
public Object[][] makeGetIndexesOfRelevantAllelesData() {
final int totalAlleles = 5;
final List<Allele> alleles = new ArrayList<>(totalAlleles);
alleles.add(Allele.create("A", true));
for ( int i = 1; i < totalAlleles; i++ )
alleles.add(Allele.create(Utils.dupString('A', i + 1), false));
final List<Object[]> tests = new ArrayList<>();
for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) {
tests.add(new Object[]{alleleIndex, alleles});
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "getIndexesOfRelevantAllelesData")
public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List<Allele> allAlleles) {
final List<Allele> myAlleles = new ArrayList<>(3);
// always add the reference and <ALT> alleles
myAlleles.add(allAlleles.get(0));
myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
// optionally add another alternate allele
if ( allelesIndex > 0 )
myAlleles.add(allAlleles.get(allelesIndex));
final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
Assert.assertEquals(indexes.length, allAlleles.size());
for ( int i = 0; i < allAlleles.size(); i++ ) {
if ( i == 0 )
Assert.assertEquals(indexes[i], 0); // ref should always match
else if ( i == allelesIndex )
Assert.assertEquals(indexes[i], 2); // allele
else
Assert.assertEquals(indexes[i], 1); // <ALT>
}
}
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {
final List<Object[]> tests = new ArrayList<>();
final int start = 10;
final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start);
final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make();
final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73};
final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10};
final List<Allele> noCalls = new ArrayList<>(2);
noCalls.add(Allele.NO_CALL);
noCalls.add(Allele.NO_CALL);
final List<Allele> A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make();
final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();
final Allele AAref = Allele.create("AA", true);
final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make();
final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();
final List<Allele> A_C = Arrays.asList(Aref, C);
final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make();
final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();
final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();
final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make();
final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make();
final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();
final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();
final Allele A = Allele.create("A", false);
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
// first test the case of a single record
tests.add(new Object[]{Arrays.asList(vcA_C_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).alleles(noCalls).make()});
// now, test pairs:
// a SNP with another SNP
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()});
// a SNP with an indel
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()});
// a SNP with 2 SNPs
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()});
// a SNP with a ref record
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()});
// spanning records:
// a SNP with a spanning ref record
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()});
// a SNP with a spanning deletion
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()});
// combination of all
tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()});
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[]{Arrays.asList(vcAA_ALT),
loc, false,
null});
tests.add(new Object[]{Arrays.asList(vcAA_ALT),
loc, true,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final List<VariantContext> toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) {
final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte)'A' : null, true);
if ( result == null ) {
Assert.assertTrue(expectedResult == null);
return;
}
Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles());
Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples());
for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) {
Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName());
// use string comparisons to test equality for now
Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString());
}
}
@Test(dataProvider = "indexOfAlleleData")
public void testIndexOfAllele(final Allele reference, final List<Allele> altAlleles, final List<Allele> otherAlleles) {
@ -1709,14 +1505,237 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
};
}
@Test(dataProvider = "generatePLsData")
public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) {
final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2);
final int[] PLs = new int[numLikelihoods];
for ( int i = 0; i < numLikelihoods; i++ )
PLs[i] = i;
final List<Allele> alleles = new ArrayList<>(numOriginalAlleles);
alleles.add(Allele.create("A", true));
for ( int i = 1; i < numOriginalAlleles; i++ )
alleles.add(Allele.create(Utils.dupString('A', i + 1), false));
final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make();
final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering);
Assert.assertEquals(newPLs.length, numLikelihoods);
final int[] expectedPLs = new int[numLikelihoods];
for ( int i = 0; i < numOriginalAlleles; i++ ) {
for ( int j = i; j < numOriginalAlleles; j++ ) {
final int index = GenotypeLikelihoods.calculatePLindex(i, j);
final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]);
expectedPLs[index] = value;
}
}
for ( int i = 0; i < numLikelihoods; i++ ) {
Assert.assertEquals(newPLs[i], expectedPLs[i]);
}
}
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final String testID, final List<VariantContext> toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) {
final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true);
if ( result == null ) {
Assert.assertTrue(expectedResult == null);
return;
}
Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID);
Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID);
for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) {
Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName());
// use string comparisons to test equality for now
Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString());
}
}
@Test
public void testGenerateADWithNewAlleles() {
final int[] originalAD = new int[] {1,2,0};
final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2};
final int[] newAD = GATKVariantContextUtils.generateAD(originalAD, indexesOfRelevantAlleles);
Assert.assertEquals(newAD, new int[]{1,2,0,0});
}
@Test(expectedExceptions = UserException.class)
public void testGetIndexesOfRelevantAllelesWithNoALT() {
final List<Allele> alleles1 = new ArrayList<>(1);
alleles1.add(Allele.create("A", true));
final List<Allele> alleles2 = new ArrayList<>(1);
alleles2.add(Allele.create("A", true));
GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2, -1);
Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
}
@Test(dataProvider = "getIndexesOfRelevantAllelesData")
public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List<Allele> allAlleles) {
final List<Allele> myAlleles = new ArrayList<>(3);
// always add the reference and <ALT> alleles
myAlleles.add(allAlleles.get(0));
myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
// optionally add another alternate allele
if ( allelesIndex > 0 )
myAlleles.add(allAlleles.get(allelesIndex));
final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1);
Assert.assertEquals(indexes.length, allAlleles.size());
for ( int i = 0; i < allAlleles.size(); i++ ) {
if ( i == 0 )
Assert.assertEquals(indexes[i], 0); // ref should always match
else if ( i == allelesIndex )
Assert.assertEquals(indexes[i], 2); // allele
else
Assert.assertEquals(indexes[i], 1); // <ALT>
}
}
@DataProvider(name = "getIndexesOfRelevantAllelesData")
public Object[][] makeGetIndexesOfRelevantAllelesData() {
final int totalAlleles = 5;
final List<Allele> alleles = new ArrayList<>(totalAlleles);
alleles.add(Allele.create("A", true));
for ( int i = 1; i < totalAlleles; i++ )
alleles.add(Allele.create(Utils.dupString('A', i + 1), false));
final List<Object[]> tests = new ArrayList<>();
for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) {
tests.add(new Object[]{alleleIndex, alleles});
}
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {
final List<Object[]> tests = new ArrayList<>();
final int start = 10;
final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start);
final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make();
final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73};
final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10};
final List<Allele> noCalls = new ArrayList<>(2);
noCalls.add(Allele.NO_CALL);
noCalls.add(Allele.NO_CALL);
final List<Allele> A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make();
final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();
final Allele AAref = Allele.create("AA", true);
final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make();
final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();
final List<Allele> A_C = Arrays.asList(Aref, C);
final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make();
final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();
final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();
final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make();
final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make();
final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();
final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();
final Allele A = Allele.create("A", false);
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
// first test the case of a single record
tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()});
// now, test pairs:
// a SNP with another SNP
tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()});
// a SNP with an indel
tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()});
// a SNP with 2 SNPs
tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()});
// a SNP with a ref record
tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()});
// spanning records:
// a SNP with a spanning ref record
tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()});
// a SNP with a spanning deletion
tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()});
// combination of all
tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
loc, false,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(),
new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(),
new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(),
new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(),
new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(),
new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()});
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT),
loc, false,
null});
tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT),
loc, true,
new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()});
final Object[][] result = tests.toArray(new Object[][]{});
return result;
}
@DataProvider(name = "generatePLsData")
public Object[][] makeGeneratePLsData() {
final List<Object[]> tests = new ArrayList<>();
for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) {
for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) {
for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) {
final int[] indexes = new int[originalAlleles];
for ( int i = 0; i < originalAlleles; i++ )
indexes[i] = i;
indexes[swapPosition1] = swapPosition2;
indexes[swapPosition2] = swapPosition1;
tests.add(new Object[]{originalAlleles, indexes});
}
}
}
return tests.toArray(new Object[][]{});
}
}