Added VERY preliminary version for merging refseq annotations as SNPs are merged
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4698 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e2f7f33ce7
commit
2f3578182a
|
|
@ -48,7 +48,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
private GenomeLocParser genomeLocParser;
|
private GenomeLocParser genomeLocParser;
|
||||||
|
|
||||||
private ReferenceSequenceFile referenceFileForMNPmerging;
|
private ReferenceSequenceFile referenceFileForMNPmerging;
|
||||||
private int maxGenomicDistanceForMNP;
|
private MergeRule mergeRule;
|
||||||
|
|
||||||
private String useSingleSample = null;
|
private String useSingleSample = null;
|
||||||
|
|
||||||
|
|
@ -58,7 +58,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
private List<VCFRecord> filteredVcfrList;
|
private List<VCFRecord> filteredVcfrList;
|
||||||
|
|
||||||
private int numRecordsAttemptToMerge;
|
private int numRecordsAttemptToMerge;
|
||||||
private int numRecordsWithinDistance;
|
private int numRecordsSatisfyingMergeRule;
|
||||||
private int numMergedRecords;
|
private int numMergedRecords;
|
||||||
private AltAlleleStatsForSamples altAlleleStats = null;
|
private AltAlleleStatsForSamples altAlleleStats = null;
|
||||||
|
|
||||||
|
|
@ -67,28 +67,28 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
// Should we call innerWriter.close() in close()
|
// Should we call innerWriter.close() in close()
|
||||||
private boolean takeOwnershipOfInner;
|
private boolean takeOwnershipOfInner;
|
||||||
|
|
||||||
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, MergeRule mergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
||||||
this.innerWriter = innerWriter;
|
this.innerWriter = innerWriter;
|
||||||
this.genomeLocParser = genomeLocParser;
|
this.genomeLocParser = genomeLocParser;
|
||||||
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
|
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
|
||||||
this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
|
this.mergeRule = mergeRule;
|
||||||
this.useSingleSample = singleSample;
|
this.useSingleSample = singleSample;
|
||||||
this.emitOnlyMergedRecords = emitOnlyMergedRecords;
|
this.emitOnlyMergedRecords = emitOnlyMergedRecords;
|
||||||
|
|
||||||
this.vcfrWaitingToMerge = null;
|
this.vcfrWaitingToMerge = null;
|
||||||
this.filteredVcfrList = new LinkedList<VCFRecord>();
|
this.filteredVcfrList = new LinkedList<VCFRecord>();
|
||||||
this.numRecordsWithinDistance = 0;
|
this.numRecordsSatisfyingMergeRule = 0;
|
||||||
this.numMergedRecords = 0;
|
this.numMergedRecords = 0;
|
||||||
|
|
||||||
if (trackAltAlleleStats)
|
if (trackAltAlleleStats)
|
||||||
this.altAlleleStats = new AltAlleleStatsForSamples(maxGenomicDistanceForMNP);
|
this.altAlleleStats = new AltAlleleStatsForSamples();
|
||||||
|
|
||||||
this.logger = logger;
|
this.logger = logger;
|
||||||
this.takeOwnershipOfInner = takeOwnershipOfInner;
|
this.takeOwnershipOfInner = takeOwnershipOfInner;
|
||||||
}
|
}
|
||||||
|
|
||||||
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) {
|
public MergePhasedSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, int maxGenomicDistanceForMNP, Logger logger) {
|
||||||
this(innerWriter, genomeLocParser, referenceFile, maxGenomicDistanceForMNP, null, false, logger, false, false); // by default: consider all samples, emit all records, don't own inner, don't keep track of alt allele statistics
|
this(innerWriter, genomeLocParser, referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser), null, false, logger, false, false); // by default: consider all samples, emit all records, don't own inner, don't keep track of alt allele statistics
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeHeader(VCFHeader header) {
|
public void writeHeader(VCFHeader header) {
|
||||||
|
|
@ -146,7 +146,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
}
|
}
|
||||||
else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them:
|
else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them:
|
||||||
numRecordsAttemptToMerge++;
|
numRecordsAttemptToMerge++;
|
||||||
boolean mergeDistanceInRange = (minDistance(vcfrWaitingToMerge.vc, vc) <= maxGenomicDistanceForMNP);
|
boolean shouldMerge = mergeRule.shouldMerge(vcfrWaitingToMerge.vc, vc);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? --
|
TODO: -- CONSIDER THE FOLLOWING EXAMPLE: WHAT DO WE WANT HERE??? --
|
||||||
|
|
@ -163,14 +163,20 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT
|
But, since we merged sites 1 and 2, we get that sites 1-2 and 3 are counted as two haplotypes of: ALT-REF and ALT-ALT
|
||||||
*/
|
*/
|
||||||
if (altAlleleStats != null)
|
if (altAlleleStats != null)
|
||||||
altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, mergeDistanceInRange);
|
altAlleleStats.updateSampleStats(vcfrWaitingToMerge.vc, vc, shouldMerge);
|
||||||
|
|
||||||
boolean mergedRecords = false;
|
boolean mergedRecords = false;
|
||||||
if (mergeDistanceInRange) {
|
if (shouldMerge) {
|
||||||
numRecordsWithinDistance++;
|
numRecordsSatisfyingMergeRule++;
|
||||||
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
|
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
|
||||||
|
|
||||||
if (mergedVc != null) {
|
if (mergedVc != null) {
|
||||||
mergedRecords = true;
|
mergedRecords = true;
|
||||||
|
|
||||||
|
Map<String, Object> updatedAttribs = RefSeqData.getMergedRefSeqAttributes(vcfrWaitingToMerge.vc, vc);
|
||||||
|
updatedAttribs.putAll(mergedVc.getAttributes());
|
||||||
|
mergedVc = VariantContext.modifyAttributes(mergedVc, updatedAttribs);
|
||||||
|
|
||||||
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
|
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
|
||||||
numMergedRecords++;
|
numMergedRecords++;
|
||||||
}
|
}
|
||||||
|
|
@ -205,16 +211,16 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
return numRecordsAttemptToMerge;
|
return numRecordsAttemptToMerge;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getNumRecordsWithinDistance() {
|
public int getNumRecordsSatisfyingMergeRule() {
|
||||||
return numRecordsWithinDistance;
|
return numRecordsSatisfyingMergeRule;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getNumMergedRecords() {
|
public int getNumMergedRecords() {
|
||||||
return numMergedRecords;
|
return numMergedRecords;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int minDistance(VariantContext vc1, VariantContext vc2) {
|
public MergeRule getMergeRule() {
|
||||||
return VariantContextUtils.getLocation(genomeLocParser,vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser,vc2));
|
return mergeRule;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -248,7 +254,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
|
|
||||||
private class AltAlleleStats {
|
private class AltAlleleStats {
|
||||||
public int numSuccessiveGenotypes;
|
public int numSuccessiveGenotypes;
|
||||||
public int numSuccessiveGenotypesWithinDistance;
|
public int numSuccessiveGenotypesThatCouldBeMerged;
|
||||||
|
|
||||||
public int oneSampleMissing;
|
public int oneSampleMissing;
|
||||||
public int atLeastOneSampleNotCalledOrFiltered;
|
public int atLeastOneSampleNotCalledOrFiltered;
|
||||||
|
|
@ -267,7 +273,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
|
|
||||||
public AltAlleleStats() {
|
public AltAlleleStats() {
|
||||||
this.numSuccessiveGenotypes = 0;
|
this.numSuccessiveGenotypes = 0;
|
||||||
this.numSuccessiveGenotypesWithinDistance = 0;
|
this.numSuccessiveGenotypesThatCouldBeMerged = 0;
|
||||||
|
|
||||||
this.oneSampleMissing = 0;
|
this.oneSampleMissing = 0;
|
||||||
this.atLeastOneSampleNotCalledOrFiltered = 0;
|
this.atLeastOneSampleNotCalledOrFiltered = 0;
|
||||||
|
|
@ -292,11 +298,11 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n");
|
sb.append("Not called or filtered:\t" + atLeastOneSampleNotCalledOrFiltered + "\n");
|
||||||
|
|
||||||
sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n");
|
sb.append("* Number of successive pairs of genotypes:\t" + numSuccessiveGenotypes + "\n");
|
||||||
sb.append("Number of successive pairs of genotypes within distance:\t" + numSuccessiveGenotypesWithinDistance + "\n");
|
sb.append("Number of successive pairs of genotypes with " + mergeRule + ":\t" + numSuccessiveGenotypesThatCouldBeMerged + "\n");
|
||||||
|
|
||||||
sb.append("Unknown segregation, within distance:\t" + segregationUnknown + "\n");
|
sb.append("Unknown segregation, " + mergeRule + ":\t" + segregationUnknown + "\n");
|
||||||
sb.append("Not variant at least one of pair, segregation known, within distance:\t" + eitherNotVariant + "\n");
|
sb.append("Not variant at least one of pair, segregation known, " + mergeRule + ":\t" + eitherNotVariant + "\n");
|
||||||
sb.append("* Variant at both, segregation known, within distance:\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");
|
sb.append("* Variant at both, segregation known, " + mergeRule + ":\t" + percentageString(bothInPairHaveVariant, numSuccessiveGenotypes) + "\n");
|
||||||
|
|
||||||
sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n");
|
sb.append("[Total haplotypes at pairs:\t" + (ref_ref_pair + ref_alt_pair + alt_ref_pair + alt_alt_pair) + "\n");
|
||||||
sb.append("REF-REF:\t" + ref_ref_pair + "\n");
|
sb.append("REF-REF:\t" + ref_ref_pair + "\n");
|
||||||
|
|
@ -321,14 +327,12 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
|
|
||||||
private class AltAlleleStatsForSamples {
|
private class AltAlleleStatsForSamples {
|
||||||
private Map<String, AltAlleleStats> sampleStats;
|
private Map<String, AltAlleleStats> sampleStats;
|
||||||
private int distance;
|
|
||||||
|
|
||||||
public AltAlleleStatsForSamples(int distance) {
|
public AltAlleleStatsForSamples() {
|
||||||
this.sampleStats = new HashMap<String, AltAlleleStats>();
|
this.sampleStats = new HashMap<String, AltAlleleStats>();
|
||||||
this.distance = distance;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean mergeDistanceInRange) {
|
public void updateSampleStats(VariantContext vc1, VariantContext vc2, boolean shouldMerge) {
|
||||||
if (vc1.isFiltered() || vc2.isFiltered())
|
if (vc1.isFiltered() || vc2.isFiltered())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
@ -353,8 +357,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
else {
|
else {
|
||||||
aas.numSuccessiveGenotypes++;
|
aas.numSuccessiveGenotypes++;
|
||||||
|
|
||||||
if (mergeDistanceInRange) {
|
if (shouldMerge) {
|
||||||
aas.numSuccessiveGenotypesWithinDistance++;
|
aas.numSuccessiveGenotypesThatCouldBeMerged++;
|
||||||
|
|
||||||
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
|
||||||
aas.segregationUnknown++;
|
aas.segregationUnknown++;
|
||||||
|
|
@ -430,7 +434,7 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
public String toString() {
|
public String toString() {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
sb.append("-------------------------------------------------------------------------\n");
|
sb.append("-------------------------------------------------------------------------\n");
|
||||||
sb.append("Per-sample alternate allele statistics [Merge distance <= " + distance + "]\n");
|
sb.append("Per-sample alternate allele statistics [" + mergeRule + "]\n");
|
||||||
sb.append("-------------------------------------------------------------------------");
|
sb.append("-------------------------------------------------------------------------");
|
||||||
|
|
||||||
for (Map.Entry<String, AltAlleleStats> sampAltAllStatsEntry : sampleStats.entrySet()) {
|
for (Map.Entry<String, AltAlleleStats> sampAltAllStatsEntry : sampleStats.entrySet()) {
|
||||||
|
|
@ -443,3 +447,132 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Some methods for extracting and merging RefSeq-related data from annotated VCF INFO fields:
|
||||||
|
*/
|
||||||
|
|
||||||
|
class RefSeqData {
|
||||||
|
private static String REFSEQ_PREFIX = "refseq.";
|
||||||
|
|
||||||
|
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
|
||||||
|
private static String NAME_KEY = REFSEQ_PREFIX + "name";
|
||||||
|
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
|
||||||
|
|
||||||
|
private static String CODON_KEY = REFSEQ_PREFIX + "codonCoord";
|
||||||
|
|
||||||
|
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
|
||||||
|
Map<String, Object> vcAttribs = vc.getAttributes();
|
||||||
|
Map<String, String> entriesToNames = new HashMap<String, String>();
|
||||||
|
|
||||||
|
Integer numRecords = VariantContextUtils.getIntegerAttribute(vcAttribs, NUM_RECORDS_KEY);
|
||||||
|
if (numRecords != null) {
|
||||||
|
for (int i = 1; i <= numRecords; i++) {
|
||||||
|
String key = NAME_KEY + "_" + i;
|
||||||
|
String name = VariantContextUtils.getStringAttribute(vcAttribs, key);
|
||||||
|
if (name != null)
|
||||||
|
entriesToNames.put(key, name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
String name = VariantContextUtils.getStringAttribute(vcAttribs, NAME_KEY);
|
||||||
|
if (name != null) {
|
||||||
|
entriesToNames.put(NAME_KEY, name);
|
||||||
|
}
|
||||||
|
else { // Check all INFO fields for a match:
|
||||||
|
for (Map.Entry<String, Object> entry : vcAttribs.entrySet()) {
|
||||||
|
String key = entry.getKey();
|
||||||
|
if (getName2 && key.startsWith(NAME2_KEY))
|
||||||
|
entriesToNames.put(key, entry.getValue().toString());
|
||||||
|
else if (key.startsWith(NAME_KEY) && !key.startsWith(NAME2_KEY))
|
||||||
|
entriesToNames.put(key, entry.getValue().toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return entriesToNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
|
||||||
|
return getRefSeqEntriesToNames(vc, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
|
||||||
|
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> getRefSeqNames(VariantContext vc) {
|
||||||
|
return getRefSeqNames(vc, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Map<String, Object> getMergedRefSeqAttributes(VariantContext vc1, VariantContext vc2) {
|
||||||
|
Map<String, Object> refSeqAttribs = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
List<RefSeqEntry> list1 = getAllRefSeqEntries(vc1);
|
||||||
|
List<RefSeqEntry> list2 = getAllRefSeqEntries(vc2);
|
||||||
|
boolean addSuffix = list1.size() > 1 || list2.size() > 1;
|
||||||
|
int count = 1;
|
||||||
|
|
||||||
|
for (RefSeqEntry refseq1 : list1) {
|
||||||
|
for (RefSeqEntry refseq2 : list2) {
|
||||||
|
Set<String> keys = new HashSet<String>();
|
||||||
|
keys.addAll(refseq1.info.keySet());
|
||||||
|
keys.addAll(refseq2.info.keySet());
|
||||||
|
|
||||||
|
String keySuffix = "";
|
||||||
|
if (addSuffix)
|
||||||
|
keySuffix = "_" + count++;
|
||||||
|
|
||||||
|
Object name1 = refseq1.info.get(NAME_KEY);
|
||||||
|
Object name2 = refseq2.info.get(NAME_KEY);
|
||||||
|
boolean sameGene = name1 != null && name2 != null && name1.equals(name2);
|
||||||
|
|
||||||
|
for (String key : keys) {
|
||||||
|
Object obj1 = refseq1.info.get(key);
|
||||||
|
Object obj2 = refseq2.info.get(key);
|
||||||
|
if (obj1 == null)
|
||||||
|
obj1 = "";
|
||||||
|
if (obj2 == null)
|
||||||
|
obj2 = "";
|
||||||
|
|
||||||
|
if (sameGene && key.equals(CODON_KEY) && obj1.equals(obj2)) // vc1 and vc2 have variants in the same codon in the same gene
|
||||||
|
System.out.println(vc1.getChr() + ":" + vc1.getStart() + " --> CODON: obj1 = " + obj1);
|
||||||
|
|
||||||
|
String useKey = key + keySuffix;
|
||||||
|
String mergedVal = obj1 + "\\" + obj2;
|
||||||
|
refSeqAttribs.put(useKey, mergedVal);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return refSeqAttribs;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
|
||||||
|
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
|
||||||
|
|
||||||
|
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
|
||||||
|
String entry = entryToName.getKey();
|
||||||
|
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
|
||||||
|
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
|
||||||
|
}
|
||||||
|
|
||||||
|
return allRefSeq;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class RefSeqEntry {
|
||||||
|
public Map<String, Object> info;
|
||||||
|
|
||||||
|
public RefSeqEntry(VariantContext vc, String entrySuffix) {
|
||||||
|
this.info = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
|
||||||
|
String key = attribEntry.getKey();
|
||||||
|
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
|
||||||
|
String genericKey = key.replaceAll(entrySuffix, "");
|
||||||
|
this.info.put(genericKey, attribEntry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -32,9 +32,12 @@ import org.broadinstitute.sting.commandline.Hidden;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -67,6 +70,13 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
|
||||||
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
|
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
|
||||||
protected boolean disablePrintAlternateAlleleStatistics = false;
|
protected boolean disablePrintAlternateAlleleStatistics = false;
|
||||||
|
|
||||||
|
public final static String IGNORE_CODING = "IGNORE";
|
||||||
|
public final static String UNION_CODING = "UNION";
|
||||||
|
public final static String INTERSECT_CODING = "INTERSECT";
|
||||||
|
|
||||||
|
@Argument(fullName = "mergeBasedOnCodingAnnotation", shortName = "mergeBasedOnCodingAnnotation", doc = "'Should merging be performed if two sites lie on the same coding sequence in the INFO field {" + IGNORE_CODING + ", " + UNION_CODING + ", " + INTERSECT_CODING + "}; [default:"+ IGNORE_CODING + "]", required = false)
|
||||||
|
protected String mergeBasedOnCodingAnnotation = IGNORE_CODING;
|
||||||
|
|
||||||
private LinkedList<String> rodNames = null;
|
private LinkedList<String> rodNames = null;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
@ -77,8 +87,16 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initializeVcfWriter() {
|
private void initializeVcfWriter() {
|
||||||
|
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
|
||||||
|
|
||||||
|
MergeRule mergeRule = null;
|
||||||
|
if (mergeBasedOnCodingAnnotation.equals(IGNORE_CODING))
|
||||||
|
mergeRule = new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser);
|
||||||
|
else
|
||||||
|
mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnCodingAnnotation);
|
||||||
|
|
||||||
// false <-> don't take control of writer, since didn't create it:
|
// false <-> don't take control of writer, since didn't create it:
|
||||||
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,getToolkit().getGenomeLocParser(),getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
|
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
|
||||||
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
|
||||||
|
|
||||||
// setup the header fields:
|
// setup the header fields:
|
||||||
|
|
@ -149,9 +167,84 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
|
||||||
if (useSingleSample != null)
|
if (useSingleSample != null)
|
||||||
System.out.println("Only considered single sample: " + useSingleSample);
|
System.out.println("Only considered single sample: " + useSingleSample);
|
||||||
|
|
||||||
System.out.println("Number of successive pairs of records (any distance): " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
|
||||||
System.out.println("Number of potentially merged records (distance <= " + maxGenomicDistanceForMNP + "): " + vcMergerWriter.getNumRecordsWithinDistance());
|
System.out.println("Number of potentially merged records (" + vcMergerWriter.getMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
|
||||||
System.out.println("Number of records merged [all samples are mergeable, some sample has a MNP of ALT alleles]: " + vcMergerWriter.getNumMergedRecords());
|
System.out.println("Number of records merged [all samples are mergeable, some sample has a MNP of ALT alleles]: " + vcMergerWriter.getNumMergedRecords());
|
||||||
System.out.println(vcMergerWriter.getAltAlleleStats());
|
System.out.println(vcMergerWriter.getAltAlleleStats());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * How the "same gene" criterion is combined with the genomic-distance criterion
 * when deciding whether two records should be merged.
 */
enum MergeBasedOnCodingAnnotation {
    /** Merge when the records are within distance OR lie on the same gene. */
    UNION_WITH_DIST,

    /** Merge only when the records are within distance AND lie on the same gene. */
    INTERSECT_WITH_DIST
}
|
||||||
|
|
||||||
|
interface MergeRule {
|
||||||
|
public boolean shouldMerge(VariantContext vc1, VariantContext vc2);
|
||||||
|
}
|
||||||
|
|
||||||
|
class DistanceMergeRule implements MergeRule {
|
||||||
|
private int maxGenomicDistanceForMNP;
|
||||||
|
private GenomeLocParser genomeLocParser;
|
||||||
|
|
||||||
|
public DistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser) {
|
||||||
|
this.maxGenomicDistanceForMNP = maxGenomicDistanceForMNP;
|
||||||
|
this.genomeLocParser = genomeLocParser;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
|
||||||
|
return minDistance(vc1, vc2) <= maxGenomicDistanceForMNP;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "Merge distance <= " + maxGenomicDistanceForMNP;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int minDistance(VariantContext vc1, VariantContext vc2) {
|
||||||
|
return VariantContextUtils.getLocation(genomeLocParser,vc1).minDistance(VariantContextUtils.getLocation(genomeLocParser,vc2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
|
||||||
|
private MergeBasedOnCodingAnnotation mergeBasedOnCodingAnnotation;
|
||||||
|
|
||||||
|
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnCodingAnnotation) {
|
||||||
|
super(maxGenomicDistanceForMNP, genomeLocParser);
|
||||||
|
|
||||||
|
if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_CODING))
|
||||||
|
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.UNION_WITH_DIST;
|
||||||
|
else if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING))
|
||||||
|
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST;
|
||||||
|
else
|
||||||
|
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_CODING + ", " + MergeSegregatingAlternateAllelesWalker.UNION_CODING + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING + " as argument to mergeBasedOnCodingAnnotation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
|
||||||
|
boolean withinDistance = super.shouldMerge(vc1, vc2);
|
||||||
|
|
||||||
|
if (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST)
|
||||||
|
return withinDistance || sameGene(vc1, vc2);
|
||||||
|
else // mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST
|
||||||
|
return withinDistance && sameGene(vc1, vc2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean sameGene(VariantContext vc1, VariantContext vc2) {
|
||||||
|
Set<String> names_vc1 = RefSeqData.getRefSeqNames(vc1);
|
||||||
|
Set<String> names_vc2 = RefSeqData.getRefSeqNames(vc2);
|
||||||
|
names_vc1.retainAll(names_vc2);
|
||||||
|
|
||||||
|
if (!names_vc1.isEmpty())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
// Check refseq.name2:
|
||||||
|
Set<String> names2_vc1 = RefSeqData.getRefSeqNames(vc1, true);
|
||||||
|
Set<String> names2_vc2 = RefSeqData.getRefSeqNames(vc2, true);
|
||||||
|
names2_vc1.retainAll(names2_vc2);
|
||||||
|
|
||||||
|
return !names2_vc1.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return super.toString() + " " + (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -138,7 +138,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
||||||
VCFWriter origWriter = writer;
|
VCFWriter origWriter = writer;
|
||||||
|
|
||||||
if (enableMergePhasedSegregatingPolymorphismsToMNP) // null <-> use ALL samples, false <-> emit all records, false <-> don't track the statistics of alternate alleles being merged:
|
if (enableMergePhasedSegregatingPolymorphismsToMNP) // null <-> use ALL samples, false <-> emit all records, false <-> don't track the statistics of alternate alleles being merged:
|
||||||
writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,getToolkit().getGenomeLocParser(),getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, null, false, logger, writer != origWriter, false);
|
writer = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, new DistanceMergeRule(maxGenomicDistanceForMNP, getToolkit().getGenomeLocParser()), null, false, logger, writer != origWriter, false);
|
||||||
|
|
||||||
/* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow
|
/* Due to discardIrrelevantPhasedSites(), the startDistance spanned by [partiallyPhasedSites.peek(), unphasedSiteQueue.peek()] is <= cacheWindow
|
||||||
Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow
|
Due to processQueue(), the startDistance spanned by [unphasedSiteQueue.peek(), mostDownstreamLocusReached] is <= cacheWindow
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue