From 069398ad46411af5787ad04a587966234587165c Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Fri, 19 Dec 2014 12:57:43 -0500 Subject: [PATCH] Added more tests and documentation --- .../tools/walkers/phasing/PhasingUtils.java | 31 ++-- .../walkers/phasing/PhasingUtilsUnitTest.java | 150 +++++++++++++++--- .../broadinstitute/gatk/utils/BaseTest.java | 1 - 3 files changed, 142 insertions(+), 40 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java index 5167b2c57..5649a2096 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java @@ -74,7 +74,7 @@ class PhasingUtils { * @param vc1 variant 1 * @param vc2 variant 2 * @param referenceFile sequence file containing the reference genome - * @param alleleMergeRule + * @param alleleMergeRule rule for merging variants * @return merged variant or null if the variants are NOT an SNP or MNP, on the same contig, variant location 1 is the same or after the variant location 2, * their genotypes do NOT have the same number of chromosomes, haplotype, number of attributes as chromosomes, are both hetrozygous or do not abide by the merge rule */ @@ -98,7 +98,7 @@ class PhasingUtils { /** * Find the alleles with the same haplotype * assumes alleleSegregationIsKnown - * TODO - should alleleSegregationIsKnown be called within the method? + * TODO - should alleleSegregationIsKnown be called within this method? * * @param gt1 genotype 1 * @param gt2 genotype 2 @@ -160,13 +160,13 @@ class PhasingUtils { } /** + * Merge variants into an MNP * * @param vc1 variant 1 * @param vc2 variant 2 * @param referenceFile sequence file containing the reference genome * @return variant with the merged MNP */ - static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { final int startInter = vc1.getEnd() + 1; final int endInter = vc2.getStart() - 1; @@ -246,25 +246,30 @@ class PhasingUtils { } /** + * Get preset attributes and whether they are in vc1 or vc2 + * TODO: Will always return an empty map because MERGE_OR_ATTRIBS is empty * - * @param vc1 - * @param vc2 - * @return merged variant attributes + * @param vc1 variant 1 + * @param vc2 variant 2 + * @return whether the preset attributes are in vc1 or vc2 */ static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { + // Map of attribute name to value Map mergedAttribs = new HashMap(); final List vcList = new LinkedList(); vcList.add(vc1); vcList.add(vc2); + // Attribute of interest //String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; final String[] MERGE_OR_ATTRIBS = {}; for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { + // Does the variant have the attribute? attribVal = vc.getAttributeAsBoolean(orAttrib, false); - if (attribVal) // already true, so no reason to continue: + if ( attribVal ) // already true, so no reason to continue: break; } mergedAttribs.put(orAttrib, attribVal); @@ -335,6 +340,8 @@ class PhasingUtils { // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1 for (final Genotype gt1 : vc1.getGenotypes()) { final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + if ( gt2 == null ) // gt2 does not have sample name + return false; if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom return false; @@ -385,7 +392,6 @@ class PhasingUtils { * @param vc2 variant 2 * @return true if there is a sample with alternate alleles, false otherwise */ - static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { for (final Genotype gt1 : vc1.getGenotypes()) { final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); @@ -404,13 +410,12 @@ class PhasingUtils { } /** - * Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference + * Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference) * * @param vc1 variant 1 * @param vc2 variant 2 * @return true if */ - static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { final Map allele1ToAllele2 = new HashMap(); final Map allele2ToAllele1 = new HashMap(); @@ -527,12 +532,12 @@ class PhasingUtils { private int intermediateLength; /** + * Constructor * - * @param intermediateBases + * @param intermediateBases array of bases * @param vc1 variant 1 * @param vc2 variant 2 */ - public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo this.intermediateBases = intermediateBases; @@ -545,7 +550,7 @@ class PhasingUtils { * * @param all1 allele 1 * @param all2 allele 2 - * @return + * @return merged allele */ public Allele ensureMergedAllele(Allele all1, Allele all2) { return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java index 547eccdaa..57078fcc1 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java @@ -56,8 +56,8 @@ import htsjdk.variant.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; + import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; @@ -66,7 +66,7 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -public class PhasingUtilsUnitTest extends BaseTest { +public class PhasingUtilsUnitTest extends BaseTest { private final int start = 10; private ReferenceSequenceFile referenceFile; @@ -79,54 +79,152 @@ public class PhasingUtilsUnitTest extends BaseTest { @Test public void TestMatchHaplotypeAllelesKeyHP() { - final List alleleList = new ArrayList<>(); - alleleList.add(Allele.create("T", false)); - alleleList.add(Allele.create("C", false)); - String samelpName = "TC"; - final Genotype genotypeALT = new GenotypeBuilder(samelpName).attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList).make(); + final List alleleList1 = new ArrayList<>(); + alleleList1.add(Allele.create("T", false)); + alleleList1.add(Allele.create("C", false)); + final Genotype genotype1 = new GenotypeBuilder("TC").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList1).make(); - // Must match because the same genotype - PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotypeALT, genotypeALT); + // Attributes in different order + final List alleleList2 = new ArrayList<>(); + alleleList2.add(Allele.create("G", false)); + alleleList2.add(Allele.create("A", false)); + final Genotype genotype2 = new GenotypeBuilder("GA").attribute("HP", new String[]{"10-2", "10-1"}).alleles(alleleList2).make(); + + PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotype1, genotype2); PhasingUtils.SameHaplotypeAlleles sameHaplotypeAllelesAnswer = new PhasingUtils.SameHaplotypeAlleles(); - sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("T", false), Allele.create("T", false))); - sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("C", false), Allele.create("C", false))); + sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("T", false), Allele.create("A", false))); + sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("C", false), Allele.create("G", false))); + sameHaplotypeAllelesAnswer.requiresSwap = true; + Assert.assertEquals(sameHaplotypeAlleles.hapAlleles, sameHaplotypeAllelesAnswer.hapAlleles); Assert.assertEquals(sameHaplotypeAlleles.requiresSwap, sameHaplotypeAllelesAnswer.requiresSwap); } @Test - public void TestMatchHaplotypeAllelesNotKeyHP() { + public void TestMatchHaplotypeAllelesNoKeyHP() { - final List alleleList = new ArrayList<>(); - alleleList.add(Allele.create("T", false)); - alleleList.add(Allele.create("C", false)); - final Genotype genotypeALT = new GenotypeBuilder("TC").PL(new int[]{0, 100, 1000}).alleles(alleleList).make(); + final List alleleList1 = new ArrayList<>(); + alleleList1.add(Allele.create("T", false)); + alleleList1.add(Allele.create("C", false)); + final Genotype genotype1 = new GenotypeBuilder("TC").alleles(alleleList1).make(); + + // Attributes in different order + final List alleleList2 = new ArrayList<>(); + alleleList2.add(Allele.create("G", false)); + alleleList2.add(Allele.create("A", false)); + final Genotype genotype2 = new GenotypeBuilder("GA").alleles(alleleList2).make(); // Must match because the same genotype - PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotypeALT, genotypeALT); + PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotype1, genotype2); PhasingUtils.SameHaplotypeAlleles sameHaplotypeAllelesAnswer = new PhasingUtils.SameHaplotypeAlleles(); - sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("T", false), Allele.create("T", false))); - sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("C", false), Allele.create("C", false))); + sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("T", false), Allele.create("G", false))); + sameHaplotypeAllelesAnswer.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("C", false), Allele.create("A", false))); Assert.assertEquals(sameHaplotypeAlleles.hapAlleles, sameHaplotypeAllelesAnswer.hapAlleles); Assert.assertEquals(sameHaplotypeAlleles.requiresSwap, sameHaplotypeAllelesAnswer.requiresSwap); } @Test - public void TestMergeIntoMNPvalidationCheck() { - String source = new String("test"); - String contig = new String("1"); + public void TestMergeIntoMNPvalidationTrueCheck() { + final String contig = new String("1"); final List alleleList1 = new ArrayList<>(); alleleList1.add(Allele.create("T", true)); alleleList1.add(Allele.create("C", false)); final List alleleList2 = new ArrayList<>(); alleleList2.add(Allele.create("A", true)); - alleleList2.add(Allele.create("CC", false)); - final VariantContext variantContext1 = new VariantContextBuilder(source, contig, start, start, alleleList1).make(); - final VariantContext variantContext2 = new VariantContextBuilder(source, contig, start, start, alleleList2).make(); + alleleList2.add(Allele.create("C", false)); + final VariantContext vc1 = new VariantContextBuilder("TC", contig, start, start, alleleList1).make(); + final VariantContext vc2 = new VariantContextBuilder("AC", contig, start+1, start+1, alleleList2).make(); GenomeLocParser genomeLocParser = new GenomeLocParser(referenceFile); - Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, variantContext1, variantContext1)); + Assert.assertTrue(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)); + } + + @Test + public void TestMergeIntoMNPvalidationCheckLocBefore() { + final String contig = new String("1"); + final List alleleList1 = new ArrayList<>(); + alleleList1.add(Allele.create("T", true)); + alleleList1.add(Allele.create("C", false)); + final List alleleList2 = new ArrayList<>(); + alleleList2.add(Allele.create("A", true)); + alleleList2.add(Allele.create("C", false)); + final VariantContext vc1 = new VariantContextBuilder("TC", contig, start+1, start+1, alleleList1).make(); + final VariantContext vc2 = new VariantContextBuilder("AC", contig, start, start, alleleList2).make(); + GenomeLocParser genomeLocParser = new GenomeLocParser(referenceFile); + + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)); + } + + @Test + public void TestMergeIntoMNPvalidationDiffContigs() { + final String contig1 = new String("1"); + final String contig2 = new String("2"); + final List alleleList1 = new ArrayList<>(); + alleleList1.add(Allele.create("T", true)); + alleleList1.add(Allele.create("C", false)); + final List alleleList2 = new ArrayList<>(); + alleleList2.add(Allele.create("A", true)); + alleleList2.add(Allele.create("C", false)); + final VariantContext vc1 = new VariantContextBuilder("TC", contig1, start+1, start+1, alleleList1).make(); + final VariantContext vc2 = new VariantContextBuilder("AC", contig2, start, start, alleleList2).make(); + GenomeLocParser genomeLocParser = new GenomeLocParser(referenceFile); + + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)); + } + + @Test + public void TestMergeVariantContextNames() { + final String result = new String("A_B"); + Assert.assertEquals(result, PhasingUtils.mergeVariantContextNames("A", "B")); + } + + @Test + public void TestMergeVariantContextAttributes() { + final String contig = new String("1"); + final List alleleList1 = new ArrayList<>(); + alleleList1.add(Allele.create("T", true)); + alleleList1.add(Allele.create("C", false)); + final List alleleList2 = new ArrayList<>(); + alleleList2.add(Allele.create("A", true)); + alleleList2.add(Allele.create("C", false)); + final VariantContext vc1 = new VariantContextBuilder("TC", contig, start, start, alleleList1).make(); + final VariantContext vc2 = new VariantContextBuilder("AC", contig, start+1, start+1, alleleList2).make(); + + Assert.assertEquals(0, PhasingUtils.mergeVariantContextAttributes(vc1, vc2).size()); + } + + @Test + public void TestAllSamplesAreMergeableDiffSampleNames() { + final String contig = new String("1"); + final List alleleList1 = new ArrayList<>(); + alleleList1.add(Allele.create("T", true)); + alleleList1.add(Allele.create("C", false)); + final List alleleList2 = new ArrayList<>(); + alleleList2.add(Allele.create("A", true)); + alleleList2.add(Allele.create("C", false)); + + final Genotype g1 = new GenotypeBuilder().name("sample1").alleles(alleleList1).make(); + final VariantContext vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(g1).make(); + Genotype g2 = new GenotypeBuilder().name("sample2").alleles(alleleList2).make(); + final VariantContext vc2 = new VariantContextBuilder().chr(contig).id("i21").source("AC").start(start).stop(start).alleles(alleleList2).genotypes(g2).make(); + + Assert.assertFalse( PhasingUtils.allSamplesAreMergeable(vc1, vc2)); + } + + @Test + public void TestAlleleSegregationIsKnown(){ + + } + + @Test + public void TestSomeSampleHasDoubleNonReferenceAllele(){ + + } + + @Test + public void TestDoubleAllelesSegregatePerfectlyAmongSamples(){ + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java index 5689dbc4c..1669406c8 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java @@ -96,7 +96,6 @@ public abstract class BaseTest { public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; - //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta";