diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java index 42d01d4cc..6be2835d0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java @@ -63,31 +63,7 @@ import htsjdk.variant.variantcontext.*; import java.util.*; /** - * [Short one sentence description of this walker] - *

- *

- * [Functionality of this walker] - *

- *

- *

Input

- *

- * [Input description] - *

- *

- *

Output

- *

- * [Output description] - *

- *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T $WalkerName
- *  
- * - * @author Your Name - * @since Date created + * Utility class for phasing analyis */ class PhasingUtils { static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { @@ -105,33 +81,53 @@ class PhasingUtils { return reallyMergeIntoMNP(vc1, vc2, referenceFile); } - // Assumes: alleleSegregationIsKnown(gt1, gt2) + /** + * Find the alleles with the same haplotype + * + * @param gt1 genotype 1 + * @param gt2 genotype 2 + * @return gt1 and gt2 alleles with the same haplotype + */ static SameHaplotypeAlleles matchHaplotypeAlleles(final Genotype gt1, final Genotype gt2) { + final SameHaplotypeAlleles hapAlleles = new SameHaplotypeAlleles(); - final int nalleles = gt1.getPloidy(); - final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[nalleles]); - final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[nalleles]); + // Skip if can not merge the haplotype alleles + if ( !alleleSegregationIsKnown(gt1, gt2) ){ + return hapAlleles; + } - final int[] site1ToSite2Inds = new int[nalleles]; - // If both genotypes have read-backed phasing haplotype identifiers + // Get the alleles + final int numAlleles = gt1.getPloidy(); + final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[numAlleles]); + final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[numAlleles]); + + // locations of the same HP attribute in gt2 to gt2 + final int[] site1ToSite2Inds = new int[numAlleles]; + + // If both genotypes have read-backed phasing haplotype identifiers (HP) + // Find the gt1 and gt2 alleles with the same haplotpe if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) && gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY); final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY); - final HashMap HPnameToSite1Inds = new HashMap(); + // Map of HP attribute to it's array index + final HashMap hpNameToSite1Inds = new HashMap(); + + // Hp name to index for (int ind1 = 0; ind1 < hp1.length; ++ind1) { - final String h1 = hp1[ind1]; - HPnameToSite1Inds.put(h1, ind1); + hpNameToSite1Inds.put(hp1[ind1], ind1); } + // Find the index of the gt2 HP attribute in gt1 HP attribute array for (int ind2 = 0; ind2 < hp2.length; ++ind2) { - final String h2 = hp2[ind2]; - final int ind1 = HPnameToSite1Inds.get(h2); + final int ind1 = hpNameToSite1Inds.get(hp2[ind2]); + // attributes are not in the same position in both genotypes if (ind2 != ind1) hapAlleles.requiresSwap = true; - site1ToSite2Inds[ind1] = ind2; // this is OK, since allSamplesAreMergeable() + + site1ToSite2Inds[ind1] = ind2; } } else { // gt1.isHom() || gt2.isHom() ; so, we trivially merge the corresponding alleles @@ -139,17 +135,27 @@ class PhasingUtils { site1ToSite2Inds[ind] = ind; } - for (int ind1 = 0; ind1 < nalleles; ++ind1) { + // Get the alleles for gt1 and gt2 with the same haplotype + for (int ind1 = 0; ind1 < numAlleles; ++ind1) { final Allele all1 = site1AllelesArray[ind1]; final int ind2 = site1ToSite2Inds[ind1]; final Allele all2 = site2AllelesArray[ind2]; // this is OK, since alleleSegregationIsKnown(gt1, gt2) + // add the 2 alleles hapAlleles.hapAlleles.add(new AlleleOneAndTwo(all1, all2)); } return hapAlleles; } + /** + * + * @param vc1 variant 1 + * @param vc2 variant 2 + * @param referenceFile sequence file containing the reference genome + * @return variant with the merged MNP + */ + static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { final int startInter = vc1.getEnd() + 1; final int endInter = vc2.getStart() - 1; @@ -217,10 +223,23 @@ class PhasingUtils { return mergedBuilder.make(); } + /** + * Merge two variant context names together with an underscore betwwen them + * + * @param name1 variant 1 name + * @param name2 variant 2 name + * @return combined variant names + */ static String mergeVariantContextNames(String name1, String name2) { return name1 + "_" + name2; } + /** + * + * @param vc1 + * @param vc2 + * @return merged variant atrtributes + */ static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { Map mergedAttribs = new HashMap(); @@ -243,6 +262,14 @@ class PhasingUtils { return mergedAttribs; } + /** + * Check if variants can be moved into the MNP + * + * @param genomeLocParser parse the genome locations + * @param vc1 variant 1 + * @param vc2 variant 2 + * @return true if variants are an SNP or MNP, on the same contig, location 1 is not before location 2, unfiltered, from the same sample set and are called, + */ static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { // Can only merge "simple" base strings (i.e., SNPs or MNPs, but not indels): final boolean vc1CanBeMerged = vc1.isSNP() || vc1.isMNP(); @@ -253,18 +280,23 @@ class PhasingUtils { final GenomeLoc loc1 = GATKVariantContextUtils.getLocation(genomeLocParser, vc1); final GenomeLoc loc2 = GATKVariantContextUtils.getLocation(genomeLocParser, vc2); + // Must be on same contig if (!loc1.onSameContig(loc2)) return false; + // Variant 1 location must not be before variant 2 if (!loc1.isBefore(loc2)) return false; + // Variants can not be filtered if (vc1.isFiltered() || vc2.isFiltered()) return false; - if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets + // Variants must come from the same sample set + if (!vc1.getSampleNames().equals(vc2.getSampleNames())) return false; + // All of the variant genotypes must be unfiltered and called if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) return false; @@ -280,6 +312,13 @@ class PhasingUtils { return true; } + /** + * Check if can merge merge variants if phased or either is a homozygous + * + * @param vc1 variant 1 + * @param vc2 variant 2 + * @return true if variants are phased or either is a homozygous, false otherwise + */ static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: for (final Genotype gt1 : vc1.getGenotypes()) { @@ -292,30 +331,48 @@ class PhasingUtils { return true; } + /** + * Check if the allele segregation is known + * + * @param gt1 genotype 1 + * @param gt2 genotype 2 + * @return true if genotypes have the same number of chromosomes, haplotype, number of attributes + * as chromosomes, and either genotype is homozygous, false otherwise + */ static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { + // If gt1 or gt2 do not have the same number of chromosomes, then can not be merged. if (gt1.getPloidy() != gt2.getPloidy()) return false; - // If gt1 or gt2 are hom, then could be MERGED: + // If gt1 or gt2 are homozygous, then could be merged. if (gt1.isHom() || gt2.isHom()) return true; - // Otherwise, need to check that alleles from gt1 can be matched up with alleles from gt2: + // If gt1 or gt2 do not read backed phasing haplotype, then can not be merged if (!gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) || !gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) return false; + // If gt1 or gt2 do not same number of HP attributes as chromosomes, then can not be merged. final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY); final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY); if (hp1.length != gt1.getPloidy() || hp2.length != gt2.getPloidy()) return false; - final String[] hp1Copy = Arrays.copyOf(hp1, hp1.length); - final String[] hp2Copy = Arrays.copyOf(hp2, hp2.length); + // gt1 and gt2 must have the same read backed phasing haplotype identifier attributes to be merged + final String[] hp1Copy = Arrays.copyOf(hp1, hp1.length); + final String[] hp2Copy = Arrays.copyOf(hp2, hp2.length); Arrays.sort(hp1Copy); Arrays.sort(hp2Copy); return (Arrays.equals(hp1Copy, hp2Copy)); // The haplotype names match (though possibly in a different order) } + /** + * + * @param vc1 variant 1 + * @param vc2 variant 2 + * @return true if + */ + static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { for (final Genotype gt1 : vc1.getGenotypes()) { final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); @@ -330,6 +387,13 @@ class PhasingUtils { return false; } + /** + * + * @param vc1 variant 1 + * @param vc2 variant 2 + * @return true if + */ + static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): final Map allele1ToAllele2 = new HashMap(); @@ -365,6 +429,9 @@ class PhasingUtils { return true; } + /** + * + */ abstract static class AlleleMergeRule { // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); @@ -374,8 +441,15 @@ class PhasingUtils { } } + /** + * Class for sore the alleles with the same haplotype + */ static class SameHaplotypeAlleles { + + /// Alleles are not in the same order public boolean requiresSwap; + + /// Lisgt of gthe 2 alleles with the same haplotype public List hapAlleles; public SameHaplotypeAlleles() { @@ -384,19 +458,41 @@ class PhasingUtils { } } + /** + * Class for holding 2 alleles + */ static class AlleleOneAndTwo { + /// allele 1 private Allele all1; + /// allele2 private Allele all2; + /** + * Constructor + * + * @param all1 allele 1 + * @param all2 allele 2 + */ public AlleleOneAndTwo(Allele all1, Allele all2) { this.all1 = all1; this.all2 = all2; } + /** + * Get the hah code for alleles 1 and 2 + * + * @return hasb code for alleles 1 and 2 + */ public int hashCode() { return all1.hashCode() + all2.hashCode(); } + /** + * Check if equal to another 2 alleles + * + * @param other allele to compare to + * @return true if equal, false otherwise + */ public boolean equals(Object other) { if (!(other instanceof AlleleOneAndTwo)) return false; @@ -406,11 +502,21 @@ class PhasingUtils { } } + /** + * Class for merging alleles data + */ static class MergedAllelesData { private Map mergedAlleles; private byte[] intermediateBases; private int intermediateLength; + /** + * + * @param intermediateBases + * @param vc1 variant 1 + * @param vc2 variant 2 + */ + public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo this.intermediateBases = intermediateBases; @@ -419,10 +525,23 @@ class PhasingUtils { this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); } + /** + * + * @param all1 allele 1 + * @param all2 allele 2 + * @return + */ public Allele ensureMergedAllele(Allele all1, Allele all2) { return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor } + /** + * + * @param all1 allele 1 + * @param all2 allele 2 + * @param creatingReferenceForFirstTime + * @return + */ private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); Allele mergedAllele = mergedAlleles.get(all12); @@ -444,6 +563,10 @@ class PhasingUtils { return mergedAllele; } + /** + * + * @return set of merged alleles values + */ public Set getAllMergedAlleles() { return new HashSet(mergedAlleles.values()); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java index 23aac6ef3..6737ec255 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java @@ -77,16 +77,17 @@ public class PhasingUtilsUnitTest extends BaseTest { public void init() throws FileNotFoundException { referenceFile = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); ref = Allele.create("C", true); - alt = Allele.create("T", true); + alt = Allele.create("T", false); } @Test public void TestMatchHaplotypeAllelesKeyHP() { final List alleleList = new ArrayList<>(); - alleleList.add(Allele.create("T", false)); + alleleList.add(Allele.create("T", true)); alleleList.add(Allele.create("C", false)); - final Genotype genotypeALT = new GenotypeBuilder("TC").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList).make(); + String samelpName = "TC"; + final Genotype genotypeALT = new GenotypeBuilder(samelpName).attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList).make(); // Must match because the same genotype PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotypeALT, genotypeALT);