diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java index e9bcb0eea..43f6376eb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java @@ -71,8 +71,8 @@ class PhasingUtils { * Merge variants into an MNP * * @param genomeLocParser parse the genome locations - * @param vc1 variant 1 - * @param vc2 variant 2 + * @param vc1 variant context 1 + * @param vc2 variant context 2 * @param referenceFile sequence file containing the reference genome * @param alleleMergeRule rule for merging variants * @return merged variant or null if the variants are NOT an SNP or MNP, on the same contig, variant location 1 is the same or after the variant location 2, @@ -162,8 +162,8 @@ class PhasingUtils { /** * Merge variants into an MNP * - * @param vc1 variant 1 - * @param vc2 variant 2 + * @param vc1 variant context 1 + * @param vc2 variant context 2 * @param referenceFile sequence file containing the reference genome * @return variant with the merged MNP */ @@ -171,16 +171,21 @@ class PhasingUtils { final int startInter = vc1.getEnd() + 1; final int endInter = vc2.getStart() - 1; byte[] intermediateBases = null; + + // get bases between vc1 and vc2 in the reference sequence if (startInter <= endInter) { intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); StringUtil.toUpperCase(intermediateBases); } + + // merge the reference bases with vc1 and vc2 final MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added final GenotypesContext mergedGenotypes = GenotypesContext.create(); for (final Genotype gt1 : vc1.getGenotypes()) { final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + // Alleles with the same haplotype final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2); boolean isPhased = gt1.isPhased() && gt2.isPhased(); @@ -197,6 +202,7 @@ class PhasingUtils { final Map mergedGtAttribs = new HashMap(); + // get the min read backed phasing quality double PQ = Double.MAX_VALUE; if (gt1.hasAnyAttribute(ReadBackedPhasing.PQ_KEY)) { PQ = Math.min(PQ, (double) gt1.getAnyAttribute(ReadBackedPhasing.PQ_KEY)); @@ -207,6 +213,7 @@ class PhasingUtils { if (PQ != Double.MAX_VALUE) mergedGtAttribs.put(ReadBackedPhasing.PQ_KEY, PQ); + // get the read backed phasing phasing haplotype identifier if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { mergedGtAttribs.put(ReadBackedPhasing.HP_KEY, gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY)); } @@ -214,21 +221,24 @@ class PhasingUtils { mergedGtAttribs.put(ReadBackedPhasing.HP_KEY, gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY)); } + // make the merged genotype final Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(isPhased).make(); mergedGenotypes.add(mergedGt); } + // get the merged name final String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource()); final double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError()); final Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered final Map mergedAttribs = mergeVariantContextAttributes(vc1, vc2); - // ids + // get the merged ID final List mergedIDs = new ArrayList(); if ( vc1.hasID() ) mergedIDs.add(vc1.getID()); if ( vc2.hasID() ) mergedIDs.add(vc2.getID()); final String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs); + // make the merged variant context final VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs); VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true); return mergedBuilder.make(); @@ -237,8 +247,8 @@ class PhasingUtils { /** * Merge two variant context names together with an underscore betwwen them * - * @param name1 variant 1 name - * @param name2 variant 2 name + * @param name1 variant context 1 name + * @param name2 variant context 2 name * @return combined variant names */ static String mergeVariantContextNames(String name1, String name2) { @@ -249,8 +259,8 @@ class PhasingUtils { * Get preset attributes and whether they are in vc1 or vc2 * TODO: Will always return an empty map because MERGE_OR_ATTRIBS is empty * - * @param vc1 variant 1 - * @param vc2 variant 2 + * @param vc1 variant context 1 + * @param vc2 variant context 2 * @return whether the preset attributes are in vc1 or vc2 */ static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { @@ -282,8 +292,8 @@ class PhasingUtils { * Check if variants can be merged into the MNP * * @param genomeLocParser parse the genome locations - * @param vc1 variant 1 - * @param vc2 variant 2 + * @param vc1 variant context 1 + * @param vc2 variant context 2 * @return true if variants are an SNP or MNP, on the same contig, variant location 1 is not before variant location 2, unfiltered, from the same sample set and are called, * false otherwise */ @@ -301,7 +311,7 @@ class PhasingUtils { if (!loc1.onSameContig(loc2)) return false; - // Variant 1 location must not be before variant 2 + // Variant 1 location must not be before variant context 2 if (!loc1.isBefore(loc2)) return false; @@ -332,8 +342,8 @@ class PhasingUtils { /** * Check if can merge variants * - * @param vc1 variant 1 - * @param vc2 variant 2 + * @param vc1 variant context 1 + * @param vc2 variant context 2 * @return true if variants are phased or either is a homozygous, false otherwise */ static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { @@ -386,23 +396,26 @@ class PhasingUtils { } /** - * Check of any samples have alternate alleles + * Check if some samples have double alternate alleles * - * @param vc1 variant 1 - * @param vc2 variant 2 - * @return true if there is a sample with alternate alleles, false otherwise + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return true if there is a sample with double alternate alleles, false otherwise */ static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { for (final Genotype gt1 : vc1.getGenotypes()) { + // gt2 from the same sample as gt1 final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); - // Find the alleles with the same haplotype - final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2); + if ( gt2 != null ) { + // Find the alleles with the same haplotype + final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2); - // Find corresponding alternate alleles - for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) { - if (all1all2.all1.isNonReference() && all1all2.all2.isNonReference()) - return true; + // Find corresponding alternate alleles + for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) { + if (all1all2.all1.isNonReference() && all1all2.all2.isNonReference()) + return true; + } } } @@ -412,9 +425,9 @@ class PhasingUtils { /** * Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference) * - * @param vc1 variant 1 - * @param vc2 variant 2 - * @return true if + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return true if alleles segregate together, false otherwise */ static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { final Map allele1ToAllele2 = new HashMap(); @@ -535,18 +548,20 @@ class PhasingUtils { * Constructor * * @param intermediateBases array of bases - * @param vc1 variant 1 - * @param vc2 variant 2 + * @param vc1 variant context 1 + * @param vc2 variant context 2 */ public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo this.intermediateBases = intermediateBases; this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; + // merge the reference bases from vc1 before and vc2 after the reference (intermediate) bases this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); } /** + * Ensure that the alleles are merged. The merged allele is alternate. * * @param all1 allele 1 * @param all2 allele 2 @@ -558,11 +573,11 @@ class PhasingUtils { /** * Ensure that the alleles are merged. - * Put the bases from all1 before and all2 after the previously merged bases + * Put the bases from all1 before and all2 after the intermediate bases * * @param all1 allele 1 * @param all2 allele 2 - * @param isRef is this a reference allele + * @param isRef if true, merged allele is reference, if false, merged allele is alternate * @return merged allele */ private Allele ensureMergedAllele(Allele all1, Allele all2, boolean isRef) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java index b48db1894..6dfe0f416 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java @@ -56,6 +56,7 @@ import htsjdk.variant.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; import java.util.Arrays; +import java.util.ArrayList; import java.util.List; import java.util.HashMap; import java.util.Map; @@ -65,7 +66,6 @@ import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; class AlwaysTrueMergeRule extends PhasingUtils.AlleleMergeRule { @@ -93,8 +93,8 @@ public class PhasingUtilsUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(referenceFile); alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false)); alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false)); - genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList1).make(); - genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).alleles(alleleList2).make(); + genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make(); + genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make(); contig = new String("1"); vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make(); vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make(); @@ -187,13 +187,22 @@ public class PhasingUtilsUnitTest extends BaseTest { } @Test - public void TestSomeSampleHasDoubleNonReferenceAllele(){ + public void TestSomeSampleHasDoubleNonReferenceAlleleTrue(){ + Genotype genotype = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList2).make(); + VariantContext vc = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype).make(); + Assert.assertTrue(PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc)); + } + + @Test + public void TestSomeSampleHasDoubleNonReferenceAlleleFalse(){ Assert.assertFalse(PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2)); } @Test public void TestDoubleAllelesSegregatePerfectlyAmongSamples(){ - Assert.assertFalse(PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2)); + Genotype genotype = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList2).make(); + VariantContext vc = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype).make(); + Assert.assertTrue(PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc)); } @Test @@ -207,18 +216,58 @@ public class PhasingUtilsUnitTest extends BaseTest { AlwaysTrueMergeRule alleleMergeRule = new AlwaysTrueMergeRule(); VariantContext vc = PhasingUtils.mergeIntoMNP(genomeLocParser, vc1, vc2, referenceFile, alleleMergeRule); - Map attributes = new HashMap(); - attributes.put("AC", new String[]{"1", "1"}); - attributes.put("AF", new String[]{"0.5", "0.5"}); - attributes.put("AN", new String("2")); List alleleList = Arrays.asList(Allele.create("TG", true), Allele.create("TA", false), Allele.create("CG", false)); - Genotype genotype = new GenotypeBuilder().name("sample").attributes(attributes).alleles(alleleList).make(); - final VariantContext vcExpected = new VariantContextBuilder().chr(contig).id("id1;id2").source("TC_AC").start(start).stop(start+1).alleles(alleleList).genotypes(genotype).make(); + Map attributes = new HashMap(){{ + put("AC", new ArrayList(Arrays.asList(1, 1))); + put("AF", new ArrayList(Arrays.asList(0.5, 0.5))); + put("AN", 2); + }}; + final Map extendedAttributes = new HashMap(){{ + put("PQ", 100.0); put("HP", new String[]{"10-1", "10-2"}); + }}; + List alleleListMeged = Arrays.asList(Allele.create("TA"), Allele.create("CG")); + Genotype genotype = new GenotypeBuilder().name("sample").attributes(extendedAttributes).alleles(alleleListMeged).make(); + final VariantContext vcExpected = new VariantContextBuilder().chr(contig).id("id1;id2").source("TC_GA").start(start).stop(start+1).alleles(alleleList).genotypes(genotype).attributes(attributes).make(); + Assert.assertTrue(genotype.sameGenotype(vcExpected.getGenotypes().get("sample"))); + Assert.assertTrue(vcExpected.hasSameAllelesAs(vc)); + Assert.assertEquals(vcExpected.getChr(), vc.getChr()); + Assert.assertEquals(vcExpected.getStart(), vc.getStart()); + Assert.assertEquals(vcExpected.getEnd(), vc.getEnd()); + Assert.assertEquals(vcExpected.getID(), vc.getID()); + Assert.assertEquals(vcExpected.getSource(), vc.getSource()); + Assert.assertEquals(vcExpected.isFiltered(), vc.isFiltered()); + Assert.assertEquals(vcExpected.getPhredScaledQual(), vc.getPhredScaledQual()); + Assert.assertEquals(vcExpected.getAttribute("PQ"), vc.getAttribute("PQ")); + Assert.assertEquals(vcExpected.getAttribute("HP"), vc.getAttribute("HP")); } @Test - public void TestReallyMergeIntoMNP(final VariantContext vc1, final VariantContext vc2 ){ + public void TestReallyMergeIntoMNP( ){ VariantContext vc = PhasingUtils.reallyMergeIntoMNP(vc1, vc2, referenceFile); + + List alleleList = Arrays.asList(Allele.create("TG", true), Allele.create("TA", false), Allele.create("CG", false)); + Map attributes = new HashMap(){{ + put("AC", new ArrayList(Arrays.asList(1, 1))); + put("AF", new ArrayList(Arrays.asList(0.5, 0.5))); + put("AN", 2); + }}; + final Map extendedAttributes = new HashMap(){{ + put("PQ", 100.0); put("HP", new String[]{"10-1", "10-2"}); + }}; + List alleleListMeged = Arrays.asList(Allele.create("TA"), Allele.create("CG")); + Genotype genotype = new GenotypeBuilder().name("sample").attributes(extendedAttributes).alleles(alleleListMeged).make(); + final VariantContext vcExpected = new VariantContextBuilder().chr(contig).id("id1;id2").source("TC_GA").start(start).stop(start+1).alleles(alleleList).genotypes(genotype).attributes(attributes).make(); + Assert.assertTrue(genotype.sameGenotype(vcExpected.getGenotypes().get("sample"))); + Assert.assertTrue(vcExpected.hasSameAllelesAs(vc)); + Assert.assertEquals(vcExpected.getChr(), vc.getChr()); + Assert.assertEquals(vcExpected.getStart(), vc.getStart()); + Assert.assertEquals(vcExpected.getEnd(), vc.getEnd()); + Assert.assertEquals(vcExpected.getID(), vc.getID()); + Assert.assertEquals(vcExpected.getSource(), vc.getSource()); + Assert.assertEquals(vcExpected.isFiltered(), vc.isFiltered()); + Assert.assertEquals(vcExpected.getPhredScaledQual(), vc.getPhredScaledQual()); + Assert.assertEquals(vcExpected.getAttribute("PQ"), vc.getAttribute("PQ")); + Assert.assertEquals(vcExpected.getAttribute("HP"), vc.getAttribute("HP")); } @Test