Added VERY preliminary version for merging refseq annotations as SNPs are merged

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4698 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-11-17 16:49:12 +00:00
parent e2f7f33ce7
commit 2f3578182a
3 changed files with 267 additions and 41 deletions

View File

@ -48,7 +48,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
private GenomeLocParser genomeLocParser;
private ReferenceSequenceFile referenceFileForMNPmerging;
private int maxGenomicDistanceForMNP;
private MergeRule mergeRule;
private String useSingleSample = null;
@ -58,7 +58,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
private List<VCFRecord> filteredVcfrList;
private int numRecordsAttemptToMerge;
private int numRecordsWithinDistance;
private int numRecordsSatisfyingMergeRule;
private int numMergedRecords;
private AltAlleleStatsForSamples altAlleleStats = null;
@ -67,28 +67,28 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
// Should we call innerWriter.close() in close()
private boolean takeOwnershipOfInner;
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, MergeRule mergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
this.innerWriter = innerWriter;
this.genomeLocParser = genomeLocParser;
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
this.mergeRule = mergeRule;
this.useSingleSample = singleSample;
this.emitOnlyMergedRecords = emitOnlyMergedRecords;
this.vcfrWaitingToMerge = null;
this.filteredVcfrList = new LinkedList<VCFRecord>();
this.numRecordsWithinDistance = 0;
this.numRecordsSatisfyingMergeRule = 0;
this.numMergedRecords = 0;
if (trackAltAlleleStats)
this.altAlleleStats = new AltAlleleStatsForSamples(maxGenomicDistanceForMNP);
this.altAlleleStats = new AltAlleleStatsForSamples();
this.logger = logger;
this.takeOwnershipOfInner = takeOwnershipOfInner;
}
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) {
this(innerWriter, genomeLocParser, referenceFile, maxGenomicDistanceForMNP, null, false, logger, false, false); // by default: consider all samples, emit all records, don't own inner, don't keep track of alt allele statistics
this(innerWriter, genomeLocParser, referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser), null, false, logger, false, false); // by default: consider all samples, emit all records, don't own inner, don't keep track of alt allele statistics
}
public void writeHeader(VCFHeader header) {
@ -117,7 +117,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
return;
}
logger.debug("Next VC input = " + VariantContextUtils.getLocation(genomeLocParser,vc));
logger.debug("Next VC input = " + VariantContextUtils.getLocation(genomeLocParser, vc));
boolean curVcIsNotFiltered = vc.isNotFiltered();
if (vcfrWaitingToMerge == null) {
@ -127,26 +127,26 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
throw new ReviewedStingException("filteredVcfrList should be empty if not waiting to merge a vc!");
if (curVcIsNotFiltered) { // still need to wait before can release vc
logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(genomeLocParser,vc));
logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(genomeLocParser, vc));
vcfrWaitingToMerge = new VCFRecord(vc, refBase, false);
}
else if (!emitOnlyMergedRecords) { // filtered records are never merged
logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(genomeLocParser,vc));
logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(genomeLocParser, vc));
innerWriter.add(vc, refBase);
}
}
else { // waiting to merge vcfrWaitingToMerge
logger.debug("Waiting to merge " + VariantContextUtils.getLocation(genomeLocParser,vcfrWaitingToMerge.vc));
logger.debug("Waiting to merge " + VariantContextUtils.getLocation(genomeLocParser, vcfrWaitingToMerge.vc));
if (!curVcIsNotFiltered) {
if (!emitOnlyMergedRecords) { // filtered records are never merged
logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(genomeLocParser,vc));
logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(genomeLocParser, vc));
filteredVcfrList.add(new VCFRecord(vc, refBase, false));
}
}
else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them:
numRecordsAttemptToMerge++;
boolean mergeDistanceInRange = (minDistance(vcfrWaitingToMerge.vc, vc) <= maxGenomicDistanceForMNP);
boolean shouldMerge = mergeRule.shouldMerge(vcfrWaitingToMerge.vc, vc);
/*
TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? --
@ -163,14 +163,20 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT
*/
if (altAlleleStats != null)
altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, mergeDistanceInRange);
altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, shouldMerge);
boolean mergedRecords = false;
if (mergeDistanceInRange) {
numRecordsWithinDistance++;
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser,vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
if (shouldMerge) {
numRecordsSatisfyingMergeRule++;
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
if (mergedVc != null) {
mergedRecords = true;
Map<String, Object> updatedAttribs = RefSeqData.getMergedRefSeqAttributes(vcfrWaitingToMerge.vc, vc);
updatedAttribs.putAll(mergedVc.getAttributes());
mergedVc = VariantContext.modifyAttributes(mergedVc, updatedAttribs);
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
numMergedRecords++;
}
@ -205,16 +211,16 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
return numRecordsAttemptToMerge;
}
public int getNumRecordsWithinDistance() {
return numRecordsWithinDistance;
public int getNumRecordsSatisfyingMergeRule() {
return numRecordsSatisfyingMergeRule;
}
public int getNumMergedRecords() {
return numMergedRecords;
}
public int minDistance(VariantContext vc1, VariantContext vc2) {
return VariantContextUtils.getLocation(genomeLocParser,vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser,vc2));
public MergeRule getMergeRule() {
return mergeRule;
}
/**
@ -248,7 +254,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
private class AltAlleleStats {
public int numSuccessiveGenotypes;
public int numSuccessiveGenotypesWithinDistance;
public int numSuccessiveGenotypesThatCouldBeMerged;
public int oneSampleMissing;
public int atLeastOneSampleNotCalledOrFiltered;
@ -267,7 +273,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
public AltAlleleStats() {
this.numSuccessiveGenotypes = 0;
this.numSuccessiveGenotypesWithinDistance = 0;
this.numSuccessiveGenotypesThatCouldBeMerged = 0;
this.oneSampleMissing = 0;
this.atLeastOneSampleNotCalledOrFiltered = 0;
@ -292,11 +298,11 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n");
sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n");
sb.append("Number of successive pairs of genotypes within distance:\t" + numSuccessiveGenotypesWithinDistance + "\n");
sb.append("Number of successive pairs of genotypes with " + mergeRule + ":\t" + numSuccessiveGenotypesThatCouldBeMerged + "\n");
sb.append("Unknown segregation, within distance:\t" + segregationUnknown + "\n");
sb.append("Not variant at least one of pair, segregation known, within distance:\t" + eitherNotVariant + "\n");
sb.append("* Variant at both, segregation known, within distance:\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");
sb.append("Unknown segregation, " + mergeRule + ":\t" + segregationUnknown + "\n");
sb.append("Not variant at least one of pair, segregation known, " + mergeRule + ":\t" + eitherNotVariant + "\n");
sb.append("* Variant at both, segregation known, " + mergeRule + ":\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");
sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n");
sb.append("REF-REF:\t" + ref_ref_pair + "\n");
@ -321,14 +327,12 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
private class AltAlleleStatsForSamples {
private Map<String, AltAlleleStats> sampleStats;
private int distance;
public AltAlleleStatsForSamples(int distance) {
public AltAlleleStatsForSamples() {
this.sampleStats = new HashMap<String, AltAlleleStats>();
this.distance = distance;
}
public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean mergeDistanceInRange) {
public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean shouldMerge) {
if (vc1.isFiltered() || vc2.isFiltered())
return;
@ -353,15 +357,15 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
else {
aas.numSuccessiveGenotypes++;
if (mergeDistanceInRange) {
aas.numSuccessiveGenotypesWithinDistance++;
if (shouldMerge) {
aas.numSuccessiveGenotypesThatCouldBeMerged++;
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
aas.segregationUnknown++;
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser,vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser,vc2));
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
}
else if (gt1.isHomRef() || gt2.isHomRef()) {
logger.debug("gt1.isHomRef() || gt2.isHomRef() for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser,vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser,vc2));
logger.debug("gt1.isHomRef() || gt2.isHomRef() for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
aas.eitherNotVariant++;
}
else { // BOTH gt1 and gt2 have at least one variant allele (so either hets, or homozygous variant):
@ -390,7 +394,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
// Check MNPs vs. CHets:
if (containsRefAllele(site1Alleles) && containsRefAllele(site2Alleles)) {
logger.debug("HET-HET for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser,vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser,vc2));
logger.debug("HET-HET for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
if (logger.isDebugEnabled() && !(gt1.isHet() && gt2.isHet()))
throw new ReviewedStingException("Since !gt1.isHomRef() && !gt2.isHomRef(), yet both have ref alleles, they BOTH must be hets!");
@ -430,7 +434,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("-------------------------------------------------------------------------\n");
sb.append("Per-sample alternate allele statistics [Merge distance <= " + distance + "]\n");
sb.append("Per-sample alternate allele statistics [" + mergeRule + "]\n");
sb.append("-------------------------------------------------------------------------");
for (Map.Entry<String, AltAlleleStats> sampAltAllStatsEntry : sampleStats.entrySet()) {
@ -442,4 +446,133 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
return sb.toString();
}
}
}
/* Some methods for extracting and merging RefSeq-related data from annotated VCF INFO fields:
*/
class RefSeqData {
private static String REFSEQ_PREFIX = "refseq.";
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
private static String NAME_KEY = REFSEQ_PREFIX + "name";
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
private static String CODON_KEY = REFSEQ_PREFIX + "codonCoord";
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
Map<String, Object> vcAttribs = vc.getAttributes();
Map<String, String> entriesToNames = new HashMap<String, String>();
Integer numRecords = VariantContextUtils.getIntegerAttribute(vcAttribs, NUM_RECORDS_KEY);
if (numRecords != null) {
for (int i = 1; i <= numRecords; i++) {
String key = NAME_KEY + "_" + i;
String name = VariantContextUtils.getStringAttribute(vcAttribs, key);
if (name != null)
entriesToNames.put(key, name);
}
}
else {
String name = VariantContextUtils.getStringAttribute(vcAttribs, NAME_KEY);
if (name != null) {
entriesToNames.put(NAME_KEY, name);
}
else { // Check all INFO fields for a match:
for (Map.Entry<String, Object> entry : vcAttribs.entrySet()) {
String key = entry.getKey();
if (getName2 && key.startsWith(NAME2_KEY))
entriesToNames.put(key, entry.getValue().toString());
else if (key.startsWith(NAME_KEY) && !key.startsWith(NAME2_KEY))
entriesToNames.put(key, entry.getValue().toString());
}
}
}
return entriesToNames;
}
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
return getRefSeqEntriesToNames(vc, false);
}
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
}
public static Set<String> getRefSeqNames(VariantContext vc) {
return getRefSeqNames(vc, false);
}
public static Map<String, Object> getMergedRefSeqAttributes(VariantContext vc1, VariantContext vc2) {
Map<String, Object> refSeqAttribs = new HashMap<String, Object>();
List<RefSeqEntry> list1 = getAllRefSeqEntries(vc1);
List<RefSeqEntry> list2 = getAllRefSeqEntries(vc2);
boolean addSuffix = list1.size() > 1 || list2.size() > 1;
int count = 1;
for (RefSeqEntry refseq1 : list1) {
for (RefSeqEntry refseq2 : list2) {
Set<String> keys = new HashSet<String>();
keys.addAll(refseq1.info.keySet());
keys.addAll(refseq2.info.keySet());
String keySuffix = "";
if (addSuffix)
keySuffix = "_" + count++;
Object name1 = refseq1.info.get(NAME_KEY);
Object name2 = refseq2.info.get(NAME_KEY);
boolean sameGene = name1 != null && name2 != null && name1.equals(name2);
for (String key : keys) {
Object obj1 = refseq1.info.get(key);
Object obj2 = refseq2.info.get(key);
if (obj1 == null)
obj1 = "";
if (obj2 == null)
obj2 = "";
if (sameGene && key.equals(CODON_KEY) && obj1.equals(obj2)) // vc1 and vc2 have variants in the same codon in the same gene
System.out.println(vc1.getChr() + ":" + vc1.getStart() + " --> CODON: obj1 = " + obj1);
String useKey = key + keySuffix;
String mergedVal = obj1 + "\\" + obj2;
refSeqAttribs.put(useKey, mergedVal);
}
}
}
return refSeqAttribs;
}
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
String entry = entryToName.getKey();
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
}
return allRefSeq;
}
private static class RefSeqEntry {
public Map<String, Object> info;
public RefSeqEntry(VariantContext vc, String entrySuffix) {
this.info = new HashMap<String, Object>();
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
String key = attribEntry.getKey();
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
String genericKey = key.replaceAll(entrySuffix, "");
this.info.put(genericKey, attribEntry.getValue());
}
}
}
}
}

View File

@ -32,9 +32,12 @@ import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.*;
@ -67,6 +70,13 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
protected boolean disablePrintAlternateAlleleStatistics = false;
public final static String IGNORE_CODING = "IGNORE";
public final static String UNION_CODING = "UNION";
public final static String INTERSECT_CODING = "INTERSECT";
@Argument(fullName = "mergeBasedOnCodingAnnotation", shortName = "mergeBasedOnCodingAnnotation", doc = "'Should merging be performed if two sites lie on the same coding sequence in the INFO field {" + IGNORE_CODING + ", " + UNION_CODING + ", " + INTERSECT_CODING + "}; [default:"+ IGNORE_CODING + "]", required = false)
protected String mergeBasedOnCodingAnnotation = IGNORE_CODING;
private LinkedList<String> rodNames = null;
public void initialize() {
@ -77,8 +87,16 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
}
private void initializeVcfWriter() {
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
MergeRule mergeRule = null;
if (mergeBasedOnCodingAnnotation.equals(IGNORE_CODING))
mergeRule = new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser);
else
mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnCodingAnnotation);
// false <-> don't take control of writer, since didn't create it:
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,getToolkit().getGenomeLocParser(),getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
// setup the header fields:
@ -149,9 +167,84 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
if (useSingleSample != null)
System.out.println("Only considered single sample: " + useSingleSample);
System.out.println("Number of successive pairs of records (any distance): " + vcMergerWriter.getNumRecordsAttemptToMerge());
System.out.println("Number of potentially merged records (distance <= " + maxGenomicDistanceForMNP + "): " + vcMergerWriter.getNumRecordsWithinDistance());
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
System.out.println("Number of potentially merged records (" + vcMergerWriter.getMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
System.out.println("Number of records merged [all samples are mergeable, some sample has a MNP of ALT alleles]: " + vcMergerWriter.getNumMergedRecords());
System.out.println(vcMergerWriter.getAltAlleleStats());
}
}
enum MergeBasedOnCodingAnnotation {
UNION_WITH_DIST, INTERSECT_WITH_DIST
}
interface MergeRule {
public boolean shouldMerge(VariantContext vc1, VariantContext vc2);
}
class DistanceMergeRule implements MergeRule {
private int maxGenomicDistanceForMNP;
private GenomeLocParser genomeLocParser;
public DistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser) {
this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
this.genomeLocParser = genomeLocParser;
}
public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
return minDistance(vc1, vc2) <= maxGenomicDistanceForMNP;
}
public String toString() {
return "Merge distance <= " + maxGenomicDistanceForMNP;
}
public int minDistance(VariantContext vc1, VariantContext vc2) {
return VariantContextUtils.getLocation(genomeLocParser,vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser,vc2));
}
}
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
private MergeBasedOnCodingAnnotation mergeBasedOnCodingAnnotation;
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnCodingAnnotation) {
super(maxGenomicDistanceForMNP, genomeLocParser);
if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_CODING))
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.UNION_WITH_DIST;
else if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING))
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST;
else
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_CODING + ", " + MergeSegregatingAlternateAllelesWalker.UNION_CODING + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING + " as argument to mergeBasedOnCodingAnnotation!");
}
public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
boolean withinDistance = super.shouldMerge(vc1, vc2);
if (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST)
return withinDistance || sameGene(vc1, vc2);
else // mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST
return withinDistance && sameGene(vc1, vc2);
}
private boolean sameGene(VariantContext vc1, VariantContext vc2) {
Set<String> names_vc1 = RefSeqData.getRefSeqNames(vc1);
Set<String> names_vc2 = RefSeqData.getRefSeqNames(vc2);
names_vc1.retainAll(names_vc2);
if (!names_vc1.isEmpty())
return true;
// Check refseq.name2:
Set<String> names2_vc1 = RefSeqData.getRefSeqNames(vc1, true);
Set<String> names2_vc2 = RefSeqData.getRefSeqNames(vc2, true);
names2_vc1.retainAll(names2_vc2);
return !names2_vc1.isEmpty();
}
public String toString() {
return super.toString() + " " + (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
}
}

View File

@ -138,7 +138,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
VCFWriter origWriter = writer;
if (enableMergePhasedSegregatingPolymorphismsToMNP) // null <-> use ALL samples, false <-> emit all records, false <-> don't track the statistics of alternate alleles being merged:
writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,getToolkit().getGenomeLocParser(),getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, null, false, logger, writer != origWriter, false);
writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, getToolkit().getGenomeLocParser()), null, false, logger, writer != origWriter, false);
/* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow
Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow