Add comments, alleleSegregationIsKnown() check is added to matchHaplotypeAlleles()

This commit is contained in:
Ron Levine 2014-12-17 03:25:26 -05:00
parent c9175eeee8
commit 56f8e4f9cf
2 changed files with 170 additions and 46 deletions

View File

@ -63,31 +63,7 @@ import htsjdk.variant.variantcontext.*;
import java.util.*; import java.util.*;
/** /**
* [Short one sentence description of this walker] * Utility class for phasing analyis
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h3>Input</h3>
* <p>
* [Input description]
* </p>
* <p/>
* <h3>Output</h3>
* <p>
* [Output description]
* </p>
* <p/>
* <h3>Examples</h3>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/ */
class PhasingUtils { class PhasingUtils {
static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) {
@ -105,33 +81,53 @@ class PhasingUtils {
return reallyMergeIntoMNP(vc1, vc2, referenceFile); return reallyMergeIntoMNP(vc1, vc2, referenceFile);
} }
// Assumes: alleleSegregationIsKnown(gt1, gt2) /**
* Find the alleles with the same haplotype
*
* @param gt1 genotype 1
* @param gt2 genotype 2
* @return gt1 and gt2 alleles with the same haplotype
*/
static SameHaplotypeAlleles matchHaplotypeAlleles(final Genotype gt1, final Genotype gt2) { static SameHaplotypeAlleles matchHaplotypeAlleles(final Genotype gt1, final Genotype gt2) {
final SameHaplotypeAlleles hapAlleles = new SameHaplotypeAlleles(); final SameHaplotypeAlleles hapAlleles = new SameHaplotypeAlleles();
final int nalleles = gt1.getPloidy(); // Skip if can not merge the haplotype alleles
final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[nalleles]); if ( !alleleSegregationIsKnown(gt1, gt2) ){
final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[nalleles]); return hapAlleles;
}
final int[] site1ToSite2Inds = new int[nalleles]; // Get the alleles
// If both genotypes have read-backed phasing haplotype identifiers final int numAlleles = gt1.getPloidy();
final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[numAlleles]);
final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[numAlleles]);
// locations of the same HP attribute in gt2 to gt2
final int[] site1ToSite2Inds = new int[numAlleles];
// If both genotypes have read-backed phasing haplotype identifiers (HP)
// Find the gt1 and gt2 alleles with the same haplotpe
if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) && gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) && gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) {
final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY); final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY);
final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY); final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY);
final HashMap<String, Integer> HPnameToSite1Inds = new HashMap<String, Integer>(); // Map of HP attribute to it's array index
final HashMap<String, Integer> hpNameToSite1Inds = new HashMap<String, Integer>();
// Hp name to index
for (int ind1 = 0; ind1 < hp1.length; ++ind1) { for (int ind1 = 0; ind1 < hp1.length; ++ind1) {
final String h1 = hp1[ind1]; hpNameToSite1Inds.put(hp1[ind1], ind1);
HPnameToSite1Inds.put(h1, ind1);
} }
// Find the index of the gt2 HP attribute in gt1 HP attribute array
for (int ind2 = 0; ind2 < hp2.length; ++ind2) { for (int ind2 = 0; ind2 < hp2.length; ++ind2) {
final String h2 = hp2[ind2]; final int ind1 = hpNameToSite1Inds.get(hp2[ind2]);
final int ind1 = HPnameToSite1Inds.get(h2);
// attributes are not in the same position in both genotypes
if (ind2 != ind1) if (ind2 != ind1)
hapAlleles.requiresSwap = true; hapAlleles.requiresSwap = true;
site1ToSite2Inds[ind1] = ind2; // this is OK, since allSamplesAreMergeable()
site1ToSite2Inds[ind1] = ind2;
} }
} }
else { // gt1.isHom() || gt2.isHom() ; so, we trivially merge the corresponding alleles else { // gt1.isHom() || gt2.isHom() ; so, we trivially merge the corresponding alleles
@ -139,17 +135,27 @@ class PhasingUtils {
site1ToSite2Inds[ind] = ind; site1ToSite2Inds[ind] = ind;
} }
for (int ind1 = 0; ind1 < nalleles; ++ind1) { // Get the alleles for gt1 and gt2 with the same haplotype
for (int ind1 = 0; ind1 < numAlleles; ++ind1) {
final Allele all1 = site1AllelesArray[ind1]; final Allele all1 = site1AllelesArray[ind1];
final int ind2 = site1ToSite2Inds[ind1]; final int ind2 = site1ToSite2Inds[ind1];
final Allele all2 = site2AllelesArray[ind2]; // this is OK, since alleleSegregationIsKnown(gt1, gt2) final Allele all2 = site2AllelesArray[ind2]; // this is OK, since alleleSegregationIsKnown(gt1, gt2)
// add the 2 alleles
hapAlleles.hapAlleles.add(new AlleleOneAndTwo(all1, all2)); hapAlleles.hapAlleles.add(new AlleleOneAndTwo(all1, all2));
} }
return hapAlleles; return hapAlleles;
} }
/**
*
* @param vc1 variant 1
* @param vc2 variant 2
* @param referenceFile sequence file containing the reference genome
* @return variant with the merged MNP
*/
static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
final int startInter = vc1.getEnd() + 1; final int startInter = vc1.getEnd() + 1;
final int endInter = vc2.getStart() - 1; final int endInter = vc2.getStart() - 1;
@ -217,10 +223,23 @@ class PhasingUtils {
return mergedBuilder.make(); return mergedBuilder.make();
} }
/**
* Merge two variant context names together with an underscore betwwen them
*
* @param name1 variant 1 name
* @param name2 variant 2 name
* @return combined variant names
*/
static String mergeVariantContextNames(String name1, String name2) { static String mergeVariantContextNames(String name1, String name2) {
return name1 + "_" + name2; return name1 + "_" + name2;
} }
/**
*
* @param vc1
* @param vc2
* @return merged variant atrtributes
*/
static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
Map<String, Object> mergedAttribs = new HashMap<String, Object>(); Map<String, Object> mergedAttribs = new HashMap<String, Object>();
@ -243,6 +262,14 @@ class PhasingUtils {
return mergedAttribs; return mergedAttribs;
} }
/**
* Check if variants can be moved into the MNP
*
* @param genomeLocParser parse the genome locations
* @param vc1 variant 1
* @param vc2 variant 2
* @return true if variants are an SNP or MNP, on the same contig, location 1 is not before location 2, unfiltered, from the same sample set and are called,
*/
static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
// Can only merge "simple" base strings (i.e., SNPs or MNPs, but not indels): // Can only merge "simple" base strings (i.e., SNPs or MNPs, but not indels):
final boolean vc1CanBeMerged = vc1.isSNP() || vc1.isMNP(); final boolean vc1CanBeMerged = vc1.isSNP() || vc1.isMNP();
@ -253,18 +280,23 @@ class PhasingUtils {
final GenomeLoc loc1 = GATKVariantContextUtils.getLocation(genomeLocParser, vc1); final GenomeLoc loc1 = GATKVariantContextUtils.getLocation(genomeLocParser, vc1);
final GenomeLoc loc2 = GATKVariantContextUtils.getLocation(genomeLocParser, vc2); final GenomeLoc loc2 = GATKVariantContextUtils.getLocation(genomeLocParser, vc2);
// Must be on same contig
if (!loc1.onSameContig(loc2)) if (!loc1.onSameContig(loc2))
return false; return false;
// Variant 1 location must not be before variant 2
if (!loc1.isBefore(loc2)) if (!loc1.isBefore(loc2))
return false; return false;
// Variants can not be filtered
if (vc1.isFiltered() || vc2.isFiltered()) if (vc1.isFiltered() || vc2.isFiltered())
return false; return false;
if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets // Variants must come from the same sample set
if (!vc1.getSampleNames().equals(vc2.getSampleNames()))
return false; return false;
// All of the variant genotypes must be unfiltered and called
if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2))
return false; return false;
@ -280,6 +312,13 @@ class PhasingUtils {
return true; return true;
} }
/**
* Check if can merge merge variants if phased or either is a homozygous
*
* @param vc1 variant 1
* @param vc2 variant 2
* @return true if variants are phased or either is a homozygous, false otherwise
*/
static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
// Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
for (final Genotype gt1 : vc1.getGenotypes()) { for (final Genotype gt1 : vc1.getGenotypes()) {
@ -292,23 +331,34 @@ class PhasingUtils {
return true; return true;
} }
/**
* Check if the allele segregation is known
*
* @param gt1 genotype 1
* @param gt2 genotype 2
* @return true if genotypes have the same number of chromosomes, haplotype, number of attributes
* as chromosomes, and either genotype is homozygous, false otherwise
*/
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
// If gt1 or gt2 do not have the same number of chromosomes, then can not be merged.
if (gt1.getPloidy() != gt2.getPloidy()) if (gt1.getPloidy() != gt2.getPloidy())
return false; return false;
// If gt1 or gt2 are hom, then could be MERGED: // If gt1 or gt2 are homozygous, then could be merged.
if (gt1.isHom() || gt2.isHom()) if (gt1.isHom() || gt2.isHom())
return true; return true;
// Otherwise, need to check that alleles from gt1 can be matched up with alleles from gt2: // If gt1 or gt2 do not read backed phasing haplotype, then can not be merged
if (!gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) || !gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) if (!gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) || !gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY))
return false; return false;
// If gt1 or gt2 do not same number of HP attributes as chromosomes, then can not be merged.
final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY); final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY);
final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY); final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY);
if (hp1.length != gt1.getPloidy() || hp2.length != gt2.getPloidy()) if (hp1.length != gt1.getPloidy() || hp2.length != gt2.getPloidy())
return false; return false;
// gt1 and gt2 must have the same read backed phasing haplotype identifier attributes to be merged
final String[] hp1Copy = Arrays.copyOf(hp1, hp1.length); final String[] hp1Copy = Arrays.copyOf(hp1, hp1.length);
final String[] hp2Copy = Arrays.copyOf(hp2, hp2.length); final String[] hp2Copy = Arrays.copyOf(hp2, hp2.length);
Arrays.sort(hp1Copy); Arrays.sort(hp1Copy);
@ -316,6 +366,13 @@ class PhasingUtils {
return (Arrays.equals(hp1Copy, hp2Copy)); // The haplotype names match (though possibly in a different order) return (Arrays.equals(hp1Copy, hp2Copy)); // The haplotype names match (though possibly in a different order)
} }
/**
*
* @param vc1 variant 1
* @param vc2 variant 2
* @return true if
*/
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
for (final Genotype gt1 : vc1.getGenotypes()) { for (final Genotype gt1 : vc1.getGenotypes()) {
final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
@ -330,6 +387,13 @@ class PhasingUtils {
return false; return false;
} }
/**
*
* @param vc1 variant 1
* @param vc2 variant 2
* @return true if
*/
static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
// Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
final Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>(); final Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
@ -365,6 +429,9 @@ class PhasingUtils {
return true; return true;
} }
/**
*
*/
abstract static class AlleleMergeRule { abstract static class AlleleMergeRule {
// vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2):
abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2);
@ -374,8 +441,15 @@ class PhasingUtils {
} }
} }
/**
* Class for sore the alleles with the same haplotype
*/
static class SameHaplotypeAlleles { static class SameHaplotypeAlleles {
/// Alleles are not in the same order
public boolean requiresSwap; public boolean requiresSwap;
/// Lisgt of gthe 2 alleles with the same haplotype
public List<AlleleOneAndTwo> hapAlleles; public List<AlleleOneAndTwo> hapAlleles;
public SameHaplotypeAlleles() { public SameHaplotypeAlleles() {
@ -384,19 +458,41 @@ class PhasingUtils {
} }
} }
/**
* Class for holding 2 alleles
*/
static class AlleleOneAndTwo { static class AlleleOneAndTwo {
/// allele 1
private Allele all1; private Allele all1;
/// allele2
private Allele all2; private Allele all2;
/**
* Constructor
*
* @param all1 allele 1
* @param all2 allele 2
*/
public AlleleOneAndTwo(Allele all1, Allele all2) { public AlleleOneAndTwo(Allele all1, Allele all2) {
this.all1 = all1; this.all1 = all1;
this.all2 = all2; this.all2 = all2;
} }
/**
* Get the hah code for alleles 1 and 2
*
* @return hasb code for alleles 1 and 2
*/
public int hashCode() { public int hashCode() {
return all1.hashCode() + all2.hashCode(); return all1.hashCode() + all2.hashCode();
} }
/**
* Check if equal to another 2 alleles
*
* @param other allele to compare to
* @return true if equal, false otherwise
*/
public boolean equals(Object other) { public boolean equals(Object other) {
if (!(other instanceof AlleleOneAndTwo)) if (!(other instanceof AlleleOneAndTwo))
return false; return false;
@ -406,11 +502,21 @@ class PhasingUtils {
} }
} }
/**
* Class for merging alleles data
*/
static class MergedAllelesData { static class MergedAllelesData {
private Map<AlleleOneAndTwo, Allele> mergedAlleles; private Map<AlleleOneAndTwo, Allele> mergedAlleles;
private byte[] intermediateBases; private byte[] intermediateBases;
private int intermediateLength; private int intermediateLength;
/**
*
* @param intermediateBases
* @param vc1 variant 1
* @param vc2 variant 2
*/
public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) {
this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo
this.intermediateBases = intermediateBases; this.intermediateBases = intermediateBases;
@ -419,10 +525,23 @@ class PhasingUtils {
this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true);
} }
/**
*
* @param all1 allele 1
* @param all2 allele 2
* @return
*/
public Allele ensureMergedAllele(Allele all1, Allele all2) { public Allele ensureMergedAllele(Allele all1, Allele all2) {
return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor
} }
/**
*
* @param all1 allele 1
* @param all2 allele 2
* @param creatingReferenceForFirstTime
* @return
*/
private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
Allele mergedAllele = mergedAlleles.get(all12); Allele mergedAllele = mergedAlleles.get(all12);
@ -444,6 +563,10 @@ class PhasingUtils {
return mergedAllele; return mergedAllele;
} }
/**
*
* @return set of merged alleles values
*/
public Set<Allele> getAllMergedAlleles() { public Set<Allele> getAllMergedAlleles() {
return new HashSet<Allele>(mergedAlleles.values()); return new HashSet<Allele>(mergedAlleles.values());
} }

View File

@ -77,16 +77,17 @@ public class PhasingUtilsUnitTest extends BaseTest {
public void init() throws FileNotFoundException { public void init() throws FileNotFoundException {
referenceFile = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); referenceFile = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
ref = Allele.create("C", true); ref = Allele.create("C", true);
alt = Allele.create("T", true); alt = Allele.create("T", false);
} }
@Test @Test
public void TestMatchHaplotypeAllelesKeyHP() { public void TestMatchHaplotypeAllelesKeyHP() {
final List<Allele> alleleList = new ArrayList<>(); final List<Allele> alleleList = new ArrayList<>();
alleleList.add(Allele.create("T", false)); alleleList.add(Allele.create("T", true));
alleleList.add(Allele.create("C", false)); alleleList.add(Allele.create("C", false));
final Genotype genotypeALT = new GenotypeBuilder("TC").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList).make(); String samelpName = "TC";
final Genotype genotypeALT = new GenotypeBuilder(samelpName).attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList).make();
// Must match because the same genotype // Must match because the same genotype
PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotypeALT, genotypeALT); PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotypeALT, genotypeALT);