Merge pull request #718 from broadinstitute/mf_rbp_fix

Fix MNP merging code to work with explicit HP phase representation
This commit is contained in:
Eric Banks 2014-09-02 20:39:22 -04:00
commit 538537dbf1
1 changed files with 133 additions and 97 deletions

View File

@ -101,59 +101,112 @@ class PhasingUtils {
return reallyMergeIntoMNP(vc1, vc2, referenceFile);
}
// Assumes: alleleSegregationIsKnown(gt1, gt2)
static SameHaplotypeAlleles matchHaplotypeAlleles(final Genotype gt1, final Genotype gt2) {
final SameHaplotypeAlleles hapAlleles = new SameHaplotypeAlleles();
final int nalleles = gt1.getPloidy();
final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[nalleles]);
final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[nalleles]);
final int[] site2Inds = new int[nalleles];
if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) && gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) {
final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY);
final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY);
final HashMap<String, Integer> site1Inds = new HashMap<String, Integer>();
for (int ind1 = 0; ind1 < hp1.length; ++ind1) {
final String h1 = hp1[ind1];
site1Inds.put(h1, ind1);
}
for (int ind2 = 0; ind2 < hp2.length; ++ind2) {
final String h2 = hp2[ind2];
final int ind1 = site1Inds.get(h2);
if (ind2 != ind1)
hapAlleles.requiresSwap = true;
site2Inds[ind2] = ind1; // this is OK, since allSamplesAreMergeable()
}
}
else { // gt1.isHom() || gt2.isHom() ; so, we trivially merge the corresponding alleles
for (int ind = 0; ind < site2Inds.length; ++ind)
site2Inds[ind] = ind;
}
for (int ind1 = 0; ind1 < nalleles; ++ind1) {
final Allele all1 = site1AllelesArray[ind1];
final int ind2 = site2Inds[ind1];
final Allele all2 = site2AllelesArray[ind2]; // this is OK, since alleleSegregationIsKnown(gt1, gt2)
hapAlleles.hapAlleles.add(new AlleleOneAndTwo(all1, all2));
}
return hapAlleles;
}
static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
int startInter = vc1.getEnd() + 1;
int endInter = vc2.getStart() - 1;
final int startInter = vc1.getEnd() + 1;
final int endInter = vc2.getStart() - 1;
byte[] intermediateBases = null;
if (startInter <= endInter) {
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
StringUtil.toUpperCase(intermediateBases);
}
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
final MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
GenotypesContext mergedGenotypes = GenotypesContext.create();
final GenotypesContext mergedGenotypes = GenotypesContext.create();
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2);
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
boolean isPhased = gt1.isPhased() && gt2.isPhased();
if (hapAlleles.requiresSwap) // at least one swap of allele order was necessary, so trio-based phasing order cannot be maintained
isPhased = false;
/* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
we preserve phase information (if any) relative to whatever precedes vc1:
*/
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
final List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) {
final Allele mergedAllele = mergeData.ensureMergedAllele(all1all2.all1, all1all2.all2);
mergedAllelesForSample.add(mergedAllele);
}
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
final double mergedGQ = Math.min(gt1.getLog10PError(), gt2.getLog10PError());
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
if (phaseQual.PQ != null)
mergedGtAttribs.put(ReadBackedPhasing.PQ_KEY, phaseQual.PQ);
final Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(phaseQual.isPhased).make();
double PQ = Double.MAX_VALUE;
if (gt1.hasAnyAttribute(ReadBackedPhasing.PQ_KEY)) {
PQ = Math.min(PQ, (double) gt1.getAnyAttribute(ReadBackedPhasing.PQ_KEY));
}
if (gt2.hasAnyAttribute(ReadBackedPhasing.PQ_KEY)) {
PQ = Math.min(PQ, (double) gt2.getAnyAttribute(ReadBackedPhasing.PQ_KEY));
}
if (PQ != Double.MAX_VALUE)
mergedGtAttribs.put(ReadBackedPhasing.PQ_KEY, PQ);
if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) {
mergedGtAttribs.put(ReadBackedPhasing.HP_KEY, gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY));
}
else if (gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { // gt1 doesn't have, but merged (so gt1 is hom and can take gt2's haplotype names):
mergedGtAttribs.put(ReadBackedPhasing.HP_KEY, gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY));
}
final Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(isPhased).make();
mergedGenotypes.add(mergedGt);
}
String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError());
Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);
final String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
final double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError());
final Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
final Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);
// ids
List<String> mergedIDs = new ArrayList<String>();
final List<String> mergedIDs = new ArrayList<String>();
if ( vc1.hasID() ) mergedIDs.add(vc1.getID());
if ( vc2.hasID() ) mergedIDs.add(vc2.getID());
String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);
final String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);
VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs);
final VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs);
VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true);
return mergedBuilder.make();
}
@ -165,11 +218,12 @@ class PhasingUtils {
static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
Map<String, Object> mergedAttribs = new HashMap<String, Object>();
List<VariantContext> vcList = new LinkedList<VariantContext>();
final List<VariantContext> vcList = new LinkedList<VariantContext>();
vcList.add(vc1);
vcList.add(vc2);
String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
//String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
final String[] MERGE_OR_ATTRIBS = {};
for (String orAttrib : MERGE_OR_ATTRIBS) {
boolean attribVal = false;
for (VariantContext vc : vcList) {
@ -184,14 +238,17 @@ class PhasingUtils {
}
static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
GenomeLoc loc1 = GATKVariantContextUtils.getLocation(genomeLocParser, vc1);
GenomeLoc loc2 = GATKVariantContextUtils.getLocation(genomeLocParser, vc2);
if (!vc1.isSNP() || !vc2.isSNP())
return false;
final GenomeLoc loc1 = GATKVariantContextUtils.getLocation(genomeLocParser, vc1);
final GenomeLoc loc2 = GATKVariantContextUtils.getLocation(genomeLocParser, vc2);
if (!loc1.onSameContig(loc2))
throw new ReviewedGATKException("Can only merge vc1, vc2 if on the same chromosome");
return false;
if (!loc1.isBefore(loc2))
throw new ReviewedGATKException("Can only merge if vc1 is BEFORE vc2");
return false;
if (vc1.isFiltered() || vc2.isFiltered())
return false;
@ -217,7 +274,7 @@ class PhasingUtils {
static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
// Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom
return false;
@ -230,47 +287,31 @@ class PhasingUtils {
if (gt1.getPloidy() != gt2.getPloidy())
return false;
/* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard].
// If gt1 or gt2 are hom, then could be MERGED:
if (gt1.isHom() || gt2.isHom())
return true;
HOWEVER, EVEN if this is not the case, but gt1.isHom(),
it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1.
*/
return (gt2.isPhased() || gt2.isHom() || gt1.isHom());
}
// Otherwise, need to check that alleles from gt1 can be matched up with alleles from gt2:
if (!gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) || !gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY))
return false;
static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
if (gt2.isPhased() || gt2.isHom())
return new PhaseAndQuality(gt1); // maintain the phase of gt1
final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY);
final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY);
if (hp1.length != gt1.getPloidy() || hp2.length != gt2.getPloidy())
return false;
if (!gt1.isHom())
throw new ReviewedGATKException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()");
/* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype
For example, if we're merging the third Genotype with the second one:
0/1
1|1
0/1
Then, we want to output:
0/1
1/2
*/
return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()]
Arrays.sort(hp1);
Arrays.sort(hp2);
return (Arrays.equals(hp1, hp2)); // The haplotype names match (though possibly in a different order)
}
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2);
for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) {
if (all1all2.all1.isNonReference() && all1all2.all2.isNonReference()) // corresponding alleles are alternate
return true;
}
}
@ -280,8 +321,8 @@ class PhasingUtils {
static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
// Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
final Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
final Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
// Note the segregation of the alleles for the reference genome:
allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
@ -289,22 +330,20 @@ class PhasingUtils {
// Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples).
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2);
for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) {
final Allele all1 = all1all2.all1;
final Allele all2 = all1all2.all2;
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next();
Allele all1To2 = allele1ToAllele2.get(all1);
final Allele all1To2 = allele1ToAllele2.get(all1);
if (all1To2 == null)
allele1ToAllele2.put(all1, all2);
else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
return false;
Allele all2To1 = allele2ToAllele1.get(all2);
final Allele all2To1 = allele2ToAllele1.get(all2);
if (all2To1 == null)
allele2ToAllele1.put(all2, all1);
else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
@ -324,6 +363,16 @@ class PhasingUtils {
}
}
static class SameHaplotypeAlleles {
public boolean requiresSwap;
public List<AlleleOneAndTwo> hapAlleles;
public SameHaplotypeAlleles() {
requiresSwap = false;
hapAlleles = new LinkedList<AlleleOneAndTwo>();
}
}
static class AlleleOneAndTwo {
private Allele all1;
private Allele all2;
@ -341,7 +390,7 @@ class PhasingUtils {
if (!(other instanceof AlleleOneAndTwo))
return false;
AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
final AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
}
}
@ -368,10 +417,10 @@ class PhasingUtils {
Allele mergedAllele = mergedAlleles.get(all12);
if (mergedAllele == null) {
byte[] bases1 = all1.getBases();
byte[] bases2 = all2.getBases();
final byte[] bases1 = all1.getBases();
final byte[] bases2 = all2.getBases();
byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
final byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
if (intermediateBases != null)
System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
@ -388,17 +437,4 @@ class PhasingUtils {
return new HashSet<Allele>(mergedAlleles.values());
}
}
static class PhaseAndQuality {
public boolean isPhased;
public Double PQ = null;
public PhaseAndQuality(Genotype gt) {
this.isPhased = gt.isPhased();
if (this.isPhased) {
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasing.PQ_KEY, -1);
if ( this.PQ == -1 ) this.PQ = null;
}
}
}
}