Moved many unused phasing walkers and utilities to archive

This commit is contained in:
Mark DePristo 2011-11-15 16:14:50 -05:00
parent 460a51f473
commit 2b2514dad2
20 changed files with 411 additions and 985 deletions

View File

@ -29,7 +29,7 @@ import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
public abstract class BaseArray implements Comparable<BaseArray> {
abstract class BaseArray implements Comparable<BaseArray> {
protected Byte[] bases;
public BaseArray(byte[] bases) {

View File

@ -30,7 +30,7 @@ import java.util.Iterator;
/*
* CardinalityCounter object allows user to iterate over all assignment of arbitrary-cardinality variables.
*/
public class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
class CardinalityCounter implements Iterator<int[]>, Iterable<int[]> {
private int[] cards;
private int[] valList;
private boolean hasNext;

View File

@ -30,7 +30,7 @@ import java.util.NoSuchElementException;
It is UNIQUE in the fact that its iterator (BidirectionalIterator) can be cloned
to save the current pointer for a later time (while the original iterator can continue to iterate).
*/
public class CloneableIteratorLinkedList<E> {
class CloneableIteratorLinkedList<E> {
private CloneableIteratorDoublyLinkedNode<E> first;
private CloneableIteratorDoublyLinkedNode<E> last;
private int size;

View File

@ -21,13 +21,13 @@
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
package org.broadinstitute.sting.gatk.walkers.phasing;
import java.util.Collection;
import java.util.Set;
import java.util.TreeSet;
public class DisjointSet {
class DisjointSet {
private ItemNode[] nodes;
public DisjointSet(int numItems) {

View File

@ -27,7 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
public class Haplotype extends BaseArray implements Cloneable {
class Haplotype extends BaseArray implements Cloneable {
public Haplotype(byte[] bases) {
super(bases);
}

View File

@ -1,133 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
/**
* Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD.
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE})
@By(DataSource.REFERENCE_ORDERED_DATA)
public class MergeMNPsWalker extends RodWalker<Integer, Integer> {

    @Output(doc = "File to which variants should be written", required = true)
    protected VCFWriter writer = null;

    // Wrapper around 'writer' that performs the MNP merging before each record is emitted.
    private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;

    @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
    protected int maxGenomicDistanceForMNP = 1;

    @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
    public RodBinding<VariantContext> variants;

    public void initialize() {
        initializeVcfWriter();
    }

    private void initializeVcfWriter() {
        // 'false' <-> the wrapper did not create 'writer', so it must not take ownership of it:
        vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, getToolkit().getGenomeLocParser(), getToolkit().getArguments().referenceFile, maxGenomicDistanceForMNP, logger, false);
        writer = null; // force all further access to go through vcMergerWriter

        // Assemble the output VCF header from the toolkit plus the reference name:
        Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
        headerLines.addAll(VCFUtils.getHeaderFields(getToolkit()));
        headerLines.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));

        Map<String, VCFHeader> headersByRod = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName()));
        vcMergerWriter.writeHeader(new VCFHeader(headerLines, new TreeSet<String>(headersByRod.get(variants.getName()).getGenotypeSamples())));
    }

    public boolean generateExtendedEvents() {
        return false;
    }

    public Integer reduceInit() {
        return 0;
    }

    /**
     * For each site, send it to be (possibly) merged with previously observed sites.
     *
     * @param tracker the meta-data tracker
     * @param ref the reference base
     * @param context the context for the given locus
     * @return dummy Integer
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if (tracker == null)
            return null;

        for (final VariantContext vc : tracker.getValues(variants, context.getLocation()))
            writeVCF(vc);

        return 0;
    }

    private void writeVCF(VariantContext vc) {
        WriteVCF.writeVCF(vc, vcMergerWriter, logger);
    }

    public Integer reduce(Integer result, Integer total) {
        return (result == null) ? total : total + result;
    }

    /**
     * Release any VariantContexts not yet processed.
     *
     * @param result Empty for now...
     */
    public void onTraversalDone(Integer result) {
        vcMergerWriter.close();
        System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
        System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
        System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
        System.out.println(vcMergerWriter.getAltAlleleStats());
    }
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010, The Broad Institute
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -44,7 +44,7 @@ import java.util.*;
// Streams in VariantContext objects and streams out VariantContexts produced by merging phased segregating polymorphisms into MNP VariantContexts
public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
private VCFWriter innerWriter;
private GenomeLocParser genomeLocParser;
@ -52,7 +52,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
private ReferenceSequenceFile referenceFileForMNPmerging;
private VariantContextMergeRule vcMergeRule;
private VariantContextUtils.AlleleMergeRule alleleMergeRule;
private PhasingUtils.AlleleMergeRule alleleMergeRule;
private String useSingleSample = null;
@ -71,7 +71,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
// Should we call innerWriter.close() in close()
private boolean takeOwnershipOfInner;
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, PhasingUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
this.innerWriter = innerWriter;
this.genomeLocParser = genomeLocParser;
try {
@ -179,7 +179,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
boolean mergedRecords = false;
if (shouldAttemptToMerge) {
numRecordsSatisfyingMergeRule++;
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
VariantContext mergedVc = PhasingUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging, alleleMergeRule);
if (mergedVc != null) {
mergedRecords = true;
@ -218,26 +218,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
filteredVcfrList.clear();
}
public int getNumRecordsAttemptToMerge() {
return numRecordsAttemptToMerge;
}
public int getNumRecordsSatisfyingMergeRule() {
return numRecordsSatisfyingMergeRule;
}
public int getNumMergedRecords() {
return numMergedRecords;
}
public VariantContextMergeRule getVcMergeRule() {
return vcMergeRule;
}
public VariantContextUtils.AlleleMergeRule getAlleleMergeRule() {
return alleleMergeRule;
}
/**
* Gets a string representation of this object.
*
@ -248,13 +228,6 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
return getClass().getName();
}
public String getAltAlleleStats() {
if (altAlleleStats == null)
return "";
return "\n" + altAlleleStats.toString();
}
private static class VCFRecord {
public VariantContext vc;
public boolean resultedFromMerge;
@ -373,7 +346,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
if (shouldAttemptToMerge) {
aas.numSuccessiveGenotypesAttemptedToBeMerged++;
if (!VariantContextUtils.alleleSegregationIsKnown(gt1, gt2)) {
if (!PhasingUtils.alleleSegregationIsKnown(gt1, gt2)) {
aas.segregationUnknown++;
logger.debug("Unknown segregation of alleles [not phased] for " + samp + " at " + VariantContextUtils.getLocation(genomeLocParser, vc1) + ", " + VariantContextUtils.getLocation(genomeLocParser, vc2));
}
@ -498,9 +471,9 @@ class DistanceMergeRule extends VariantContextMergeRule {
}
class ExistsDoubleAltAlleleMergeRule extends VariantContextUtils.AlleleMergeRule {
class ExistsDoubleAltAlleleMergeRule extends PhasingUtils.AlleleMergeRule {
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
return VariantContextUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
return PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2);
}
public String toString() {
@ -515,7 +488,7 @@ class SegregatingMNPmergeAllelesRule extends ExistsDoubleAltAlleleMergeRule {
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
// Must be interesting AND consistent:
return super.allelesShouldBeMerged(vc1, vc2) && VariantContextUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
return super.allelesShouldBeMerged(vc1, vc2) && PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc2);
}
public String toString() {

View File

@ -1,236 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
/**
* Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD.
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE})
@By(DataSource.REFERENCE_ORDERED_DATA)
public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, Integer> {
@Output(doc = "File to which variants should be written", required = true)
protected VCFWriter writer = null;
private MergeSegregatingAlternateAllelesVCFWriter vcMergerWriter = null;
@Argument(fullName = "maxGenomicDistance", shortName = "maxDist", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records; [default:1]", required = false)
protected int maxGenomicDistance = 1;
@Argument(fullName = "useSingleSample", shortName = "useSample", doc = "Only output genotypes for the single sample given; [default:use all samples]", required = false)
protected String useSingleSample = null;
@Hidden
@Argument(fullName = "emitOnlyMergedRecords", shortName = "emitOnlyMerged", doc = "Only output records that resulted from merging [For DEBUGGING purposes only - DO NOT USE, since it disregards the semantics of '|' as 'phased relative to previous non-filtered VC']; [default:false]", required = false)
protected boolean emitOnlyMergedRecords = false;
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
protected boolean disablePrintAlternateAlleleStatistics = false;
public final static String IGNORE_REFSEQ = "IGNORE";
public final static String UNION_REFSEQ = "UNION";
public final static String INTERSECT_REFSEQ = "INTERSECT";
@Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:" + IGNORE_REFSEQ + "]", required = false)
protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;
@Argument(fullName = "dontRequireSomeSampleHasDoubleAltAllele", shortName = "dontRequireSomeSampleHasDoubleAltAllele", doc = "Should the requirement, that SUCCESSIVE records to be merged have at least one sample with a double alternate allele, be relaxed?; [default:false]", required = false)
protected boolean dontRequireSomeSampleHasDoubleAltAllele = false;
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
public RodBinding<VariantContext> variants;
public void initialize() {
initializeVcfWriter();
}
private void initializeVcfWriter() {
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
VariantContextMergeRule vcMergeRule;
if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
vcMergeRule = new DistanceMergeRule(maxGenomicDistance, genomeLocParser);
else
vcMergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistance, genomeLocParser, mergeBasedOnRefSeqAnnotation);
VariantContextUtils.AlleleMergeRule alleleMergeRule;
if (dontRequireSomeSampleHasDoubleAltAllele) // if a pair of VariantContext passes the vcMergeRule, then always merge them if there is a trailing prefix of polymorphisms (i.e., upstream polymorphic site):
alleleMergeRule = new PrefixPolymorphismMergeAllelesRule();
else
alleleMergeRule = new ExistsDoubleAltAlleleMergeRule();
// false <-> don't take control of writer, since didn't create it:
vcMergerWriter = new MergeSegregatingAlternateAllelesVCFWriter(writer, genomeLocParser, getToolkit().getArguments().referenceFile, vcMergeRule, alleleMergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
writer = null; // so it can't be accessed directly [i.e., not through vcMergerWriter]
// setup the header fields:
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName()));
vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(variants.getName()).getGenotypeSamples())));
}
public boolean generateExtendedEvents() {
return false;
}
public Integer reduceInit() {
return 0;
}
/**
* For each site, send it to be (possibly) merged with previously observed sites.
*
* @param tracker the meta-data tracker
* @param ref the reference base
* @param context the context for the given locus
* @return dummy Integer
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (tracker == null)
return null;
for (VariantContext vc : tracker.getValues(variants, context.getLocation()))
writeVCF(vc);
return 0;
}
private void writeVCF(VariantContext vc) {
WriteVCF.writeVCF(vc, vcMergerWriter, logger);
}
public Integer reduce(Integer result, Integer total) {
if (result == null)
return total;
return total + result;
}
/**
* Release any VariantContexts not yet processed.
*
* @param result Empty for now...
*/
public void onTraversalDone(Integer result) {
vcMergerWriter.close();
if (useSingleSample != null)
System.out.println("Only considered single sample: " + useSingleSample);
System.out.println("Number of successive pairs of records: " + vcMergerWriter.getNumRecordsAttemptToMerge());
System.out.println("Number of potentially merged records (" + vcMergerWriter.getVcMergeRule() + "): " + vcMergerWriter.getNumRecordsSatisfyingMergeRule());
System.out.println("Number of records merged ("+ vcMergerWriter.getAlleleMergeRule() + "): " + vcMergerWriter.getNumMergedRecords());
System.out.println(vcMergerWriter.getAltAlleleStats());
}
}
// Specifies how the RefSeq same-gene criterion combines with the genomic-distance
// criterion in SameGenePlusWithinDistanceMergeRule.shouldAttemptToMerge():
//   UNION_WITH_DIST     - merge if within distance OR on the same gene
//   INTERSECT_WITH_DIST - merge if within distance AND on the same gene
enum MergeBasedOnRefSeqAnnotation {
UNION_WITH_DIST, INTERSECT_WITH_DIST
}
// Merge rule that augments the genomic-distance criterion with a "lies on the same
// RefSeq gene" criterion, combined by either OR (UNION) or AND (INTERSECT).
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
    // How the gene criterion combines with the inherited distance criterion:
    private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;

    public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) {
        super(maxGenomicDistanceForMNP, genomeLocParser);

        if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ))
            this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST;
        else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ))
            this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST;
        else
            throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
    }

    public boolean shouldAttemptToMerge(VariantContext vc1, VariantContext vc2) {
        boolean closeEnough = super.shouldAttemptToMerge(vc1, vc2);

        if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
            return closeEnough || sameGene(vc1, vc2);

        // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST:
        return closeEnough && sameGene(vc1, vc2);
    }

    // True if the two records share a RefSeq name (or, failing that, a refseq.name2 entry):
    private boolean sameGene(VariantContext vc1, VariantContext vc2) {
        Set<String> sharedNames = RefSeqDataParser.getRefSeqNames(vc1);
        sharedNames.retainAll(RefSeqDataParser.getRefSeqNames(vc2));
        if (!sharedNames.isEmpty())
            return true;

        // Fall back to refseq.name2:
        Set<String> sharedNames2 = RefSeqDataParser.getRefSeqNames(vc1, true);
        sharedNames2.retainAll(RefSeqDataParser.getRefSeqNames(vc2, true));
        return !sharedNames2.isEmpty();
    }

    public String toString() {
        return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
    }

    public Map<String, Object> addToMergedAttributes(VariantContext vc1, VariantContext vc2) {
        Map<String, Object> mergedAttribs = super.addToMergedAttributes(vc1, vc2);
        mergedAttribs.putAll(RefSeqDataParser.getMergedRefSeqNameAttributes(vc1, vc2));
        return mergedAttribs;
    }
}
// Relaxed allele-merge rule (used with -dontRequireSomeSampleHasDoubleAltAllele):
// merge whenever the upstream (first) record is itself polymorphic, regardless of
// how the alleles segregate at the downstream record.
class PrefixPolymorphismMergeAllelesRule extends VariantContextUtils.AlleleMergeRule {
public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) {
// Only vc1 is consulted; vc2's content is irrelevant to this rule:
return vc1.isPolymorphic();
}
public String toString() {
return super.toString() + ", there exists a polymorphism at the start of the merged allele";
}
}

View File

@ -23,12 +23,10 @@
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.DisjointSet;
import java.util.*;
// Represents an undirected graph with no self-edges:
public class PhasingGraph implements Iterable<PhasingGraphEdge> {
class PhasingGraph implements Iterable<PhasingGraphEdge> {
private Neighbors[] adj;
public PhasingGraph(int numVertices) {

View File

@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
/*
Edge class for PhasingGraph
*/
public class PhasingGraphEdge implements Comparable<PhasingGraphEdge> {
class PhasingGraphEdge implements Comparable<PhasingGraphEdge> {
protected int v1;
protected int v2;

View File

@ -29,7 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
public class PhasingRead extends BaseArray {
class PhasingRead extends BaseArray {
private PreciseNonNegativeDouble mappingProb; // the probability that this read is mapped correctly
private PreciseNonNegativeDouble[] baseProbs; // the probabilities that the base identities are CORRECT
private PreciseNonNegativeDouble[] baseErrorProbs; // the probabilities that the base identities are INCORRECT

View File

@ -0,0 +1,387 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
 * Static helper routines used by the phasing walkers to merge consecutive,
 * phased VariantContext records into multi-nucleotide polymorphism (MNP) records.
 * <p>
 * Provides the validation checks that determine whether two records may be merged
 * (same samples, known allele segregation), the merging logic itself, and the
 * {@code AlleleMergeRule} hook that decides whether a permissible merge is
 * worth performing.
 */
class PhasingUtils {
/**
 * Attempts to merge two VariantContexts into a single MNP record.
 *
 * @return the merged record, or null when merging is impossible
 *         (validation or segregation fails) or pointless (allele rule fails)
 */
static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) {
    // Short-circuit chain preserves check order: positional/sample validation first,
    // then logical mergeability, then whether there's a "point" in merging
    // (e.g., annotations could be changed):
    final boolean canMerge = mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)
            && allSamplesAreMergeable(vc1, vc2)
            && alleleMergeRule.allelesShouldBeMerged(vc1, vc2);

    return canMerge ? reallyMergeIntoMNP(vc1, vc2, referenceFile) : null;
}
/**
 * Performs the actual MNP merge of vc1 and vc2 (callers must have already verified
 * mergeability via mergeIntoMNPvalidationCheck() and allSamplesAreMergeable()).
 * Reference bases between the two records are pulled from referenceFile and
 * upper-cased; per-sample genotypes are merged allele-by-allele in order, and the
 * resulting record spans vc1.getStart() to vc2.getEnd().
 */
static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
// Reference bases strictly between the two records (empty when they are adjacent):
int startInter = vc1.getEnd() + 1;
int endInter = vc2.getStart() - 1;
byte[] intermediateBases = null;
if (startInter <= endInter) {
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
StringUtil.toUpperCase(intermediateBases);
}
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
GenotypeCollection mergedGenotypes = GenotypeCollection.create();
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
/* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
we preserve phase information (if any) relative to whatever precedes vc1:
*/
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
mergedAllelesForSample.add(mergedAllele);
}
// Merged genotype quality: the larger of the two per-genotype -log10(P_error) values:
double mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError());
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
if (phaseQual.PQ != null)
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
mergedGenotypes.add(mergedGt);
}
String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource());
// Merged record quality: the larger of the two record-level -log10(P_error) values:
double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError());
Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
Map<String, Object> mergedAttribs = mergeVariantContextAttributes(vc1, vc2);
// ids
List<String> mergedIDs = new ArrayList<String>();
if ( vc1.hasID() ) mergedIDs.add(vc1.getID());
if ( vc2.hasID() ) mergedIDs.add(vc2.getID());
String mergedID = Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);
// TODO -- FIX ID
VariantContext mergedVc = new VariantContext(mergedName, mergedID, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs);
// Recompute chromosome counts (e.g., AC/AF/AN) on the merged record and fold them back in:
mergedAttribs = new HashMap<String, Object>(mergedVc.getAttributes());
VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true);
mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs);
return mergedVc;
}
/**
 * Builds the source name of a merged record by joining the two input names with '_'.
 */
static String mergeVariantContextNames(String name1, String name2) {
    return String.join("_", name1, name2);
}
/**
 * Builds the attribute map for a merged record. Each "OR" attribute (currently
 * just the dbSNP membership flag) is true iff it is true in either input record.
 */
static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
    Map<String, Object> mergedAttribs = new HashMap<String, Object>();

    final String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
    for (final String orAttrib : MERGE_OR_ATTRIBS) {
        // Short-circuit mirrors the original early-break: vc2 is only consulted when vc1's flag is false.
        boolean attribVal = vc1.getAttributeAsBoolean(orAttrib, false) || vc2.getAttributeAsBoolean(orAttrib, false);
        mergedAttribs.put(orAttrib, attribVal);
    }

    return mergedAttribs;
}
/**
 * Validates that vc1 and vc2 are candidates for MNP merging.
 *
 * @return false when either record is filtered, their sample sets differ,
 *         or any genotype is filtered or uncalled
 * @throws ReviewedStingException if the records are on different contigs
 *         or vc1 does not precede vc2
 */
static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
    final GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1);
    final GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2);

    if (!loc1.onSameContig(loc2))
        throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome");
    if (!loc1.isBefore(loc2))
        throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2");

    if (vc1.isFiltered() || vc2.isFiltered())
        return false;

    if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets
        return false;

    return allGenotypesAreUnfilteredAndCalled(vc1) && allGenotypesAreUnfilteredAndCalled(vc2);
}
/**
 * Returns true iff every genotype in vc is both called and unfiltered.
 */
static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) {
    boolean allUsable = true;
    for (final Genotype gt : vc.getGenotypes()) {
        final boolean calledAndUnfiltered = !gt.isNoCall() && !gt.isFiltered();
        if (!calledAndUnfiltered) {
            allUsable = false;
            break; // one bad genotype is enough to reject the whole record
        }
    }
    return allUsable;
}
/**
 * Returns true iff, for every sample, the genotype at vc2 is uniquely appendable
 * onto the sample's genotype at vc1 (i.e., allele segregation is known).
 */
static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
    for (final Genotype firstSiteGt : vc1.getGenotypes()) {
        final Genotype secondSiteGt = vc2.getGenotype(firstSiteGt.getSampleName());

        // Mergeable when: phased, or either genotype is hom:
        if (!alleleSegregationIsKnown(firstSiteGt, secondSiteGt))
            return false;
    }
    return true;
}
/**
 * Returns true iff the joint segregation of gt1's and gt2's alleles is determined:
 * the ploidies match AND (gt2 is phased, or either genotype is homozygous).
 */
static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
    if (gt1.getPloidy() != gt2.getPloidy())
        return false;

    // Standard case: gt2 phased or hom means gt2's alleles line up with gt1's.
    if (gt2.isPhased() || gt2.isHom())
        return true;

    // Even otherwise, if gt1 is hom then each of gt2's alleles trivially
    // segregates with gt1's single allele type:
    return gt1.isHom();
}
/* Decides the phase (and phasing quality) of the Genotype merged from gt1 and gt2.
   Assumes alleleSegregationIsKnown(gt1, gt2).

   If gt2 is phased or hom, the merged Genotype keeps gt1's phase relative to what
   precedes it. Otherwise gt1 must be hom, and the merged (het) Genotype inherits
   gt2's (unphased) status. For example, merging the third Genotype with the second:
       0/1
       1|1
       0/1
   should output:
       0/1
       1/2
 */
static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
    boolean gt2SegregatesOnItsOwn = gt2.isPhased() || gt2.isHom();
    if (gt2SegregatesOnItsOwn)
        return new PhaseAndQuality(gt1); // maintain the phase of gt1

    if (!gt1.isHom())
        throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()");

    return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()]
}
/* Checks if any sample has an MNP of ALT alleles (corresponding alleles at both
   sites are alternate). Assumes vc1 and vc2 cover the same sample names and
   allSamplesAreMergeable(vc1, vc2), so each sample has equal ploidy at both sites.
 */
static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
    for (final Genotype gt1 : vc1.getGenotypes()) {
        final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
        final List<Allele> firstSiteAlleles = gt1.getAlleles();
        final List<Allele> secondSiteAlleles = gt2.getAlleles();

        // Parallel walk is safe, since allSamplesAreMergeable() guarantees equal ploidy:
        for (int i = 0; i < firstSiteAlleles.size(); i++) {
            if (firstSiteAlleles.get(i).isNonReference() && secondSiteAlleles.get(i).isNonReference())
                return true;
        }
    }
    return false;
}
/* Checks if all samples are consistent in their haplotypes: the alleles at vc1 and
   at vc2 must segregate together one-to-one across the reference genome and every
   sample. Assumes vc1 and vc2 cover the same sample names and
   allSamplesAreMergeable(vc1, vc2).
 */
static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
    Map<Allele, Allele> site1ToSite2 = new HashMap<Allele, Allele>();
    Map<Allele, Allele> site2ToSite1 = new HashMap<Allele, Allele>();

    // The reference alleles segregate together (maps are empty, so this always succeeds):
    recordSegregation(site1ToSite2, vc1.getReference(), vc2.getReference());
    recordSegregation(site2ToSite1, vc2.getReference(), vc1.getReference());

    // Each sample's pairings must be consistent with the reference and all previous samples:
    for (final Genotype gt1 : vc1.getGenotypes()) {
        final Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
        final List<Allele> firstSiteAlleles = gt1.getAlleles();
        final List<Allele> secondSiteAlleles = gt2.getAlleles();

        for (int i = 0; i < firstSiteAlleles.size(); i++) {
            final Allele all1 = firstSiteAlleles.get(i);
            final Allele all2 = secondSiteAlleles.get(i);
            // Fail if all1 was seen with a different partner at site 2, or vice versa:
            if (!recordSegregation(site1ToSite2, all1, all2) || !recordSegregation(site2ToSite1, all2, all1))
                return false;
        }
    }
    return true;
}

/* Records that key segregates with value; returns false if key was previously
   recorded with a DIFFERENT value (an inconsistent segregation).
 */
private static boolean recordSegregation(Map<Allele, Allele> pairing, Allele key, Allele value) {
    final Allele previous = pairing.get(key);
    if (previous == null) {
        pairing.put(key, value);
        return true;
    }
    return previous.equals(value);
}
/* Pluggable rule deciding whether two adjacent records' alleles should be merged into an MNP.
   vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2):
 */
abstract static class AlleleMergeRule {
    /* Returns true if vc1 and vc2 should be merged into a single MNP record. */
    abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2);

    // Describes the base precondition under which every rule is consulted:
    @Override
    public String toString() {
        return "all samples are mergeable";
    }
}
/* An ORDERED pair of alleles (one from each of the two merged sites), suitable
   for use as a HashMap key: equals()/hashCode() are value-based and order-sensitive.
 */
static class AlleleOneAndTwo {
    private final Allele all1; // allele at the first site
    private final Allele all2; // allele at the second site

    public AlleleOneAndTwo(Allele all1, Allele all2) {
        this.all1 = all1;
        this.all2 = all2;
    }

    // Asymmetric combination (31 * a + b) so that (a, b) and (b, a) usually hash
    // differently; the previous symmetric a + b collided for every swapped pair.
    // Equal pairs still produce equal hashes, preserving the hashCode contract.
    @Override
    public int hashCode() {
        return 31 * all1.hashCode() + all2.hashCode();
    }

    @Override
    public boolean equals(Object other) {
        if (!(other instanceof AlleleOneAndTwo))
            return false;
        AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
        return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
    }
}
/* Cache of merged alleles, keyed by the ordered (site-1 allele, site-2 allele) pair
   they were built from. A merged allele's bases are: site-1 bases, then the reference
   bases strictly between the two sites (if any), then site-2 bases.
 */
static class MergedAllelesData {
    private Map<AlleleOneAndTwo, Allele> mergedAlleles; // (all1, all2) -> merged allele
    private byte[] intermediateBases; // reference bases between the sites; null when the sites are adjacent
    private int intermediateLength;

    // The constructor eagerly creates the merged REFERENCE allele, so every later
    // ensureMergedAllele(all1, all2) call can safely pass creatingReferenceForFirstTime = false:
    public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) {
        this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo
        this.intermediateBases = intermediateBases;
        this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0;
        this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true);
    }

    /* Returns the merged allele for (all1, all2), creating and caching it on first use. */
    public Allele ensureMergedAllele(Allele all1, Allele all2) {
        return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor
    }

    private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
        AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
        Allele mergedAllele = mergedAlleles.get(all12);

        if (mergedAllele == null) {
            // Concatenate: bases1 + intermediateBases + bases2
            byte[] bases1 = all1.getBases();
            byte[] bases2 = all2.getBases();

            byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
            System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
            if (intermediateBases != null)
                System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
            System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);

            mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime);
            mergedAlleles.put(all12, mergedAllele);
        }

        return mergedAllele;
    }

    /* Returns the set of all distinct merged alleles created so far (including the reference). */
    public Set<Allele> getAllMergedAlleles() {
        return new HashSet<Allele>(mergedAlleles.values());
    }
}
/* Phase status (and phasing quality, if available) to assign to a merged Genotype,
   captured from an existing Genotype.
 */
static class PhaseAndQuality {
    public boolean isPhased;
    public Double PQ = null; // phasing quality; null when unphased or when the genotype carries no PQ attribute

    public PhaseAndQuality(Genotype gt) {
        this.isPhased = gt.isPhased();
        if (this.isPhased) {
            this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
            if ( this.PQ == -1 ) this.PQ = null; // -1 is the "attribute missing" sentinel passed above
        }
    }
}
}

View File

@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing;
/* PreciseNonNegativeDouble permits arithmetic operations on NON-NEGATIVE double values
with precision (prevents underflow by representing in log10 space).
*/
public class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDouble> {
class PreciseNonNegativeDouble implements Comparable<PreciseNonNegativeDouble> {
private static final double EQUALS_THRESH = 1e-6;
private static final double INFINITY = Double.POSITIVE_INFINITY;

View File

@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.DisjointSet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.codecs.vcf.*;
@ -1052,7 +1051,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private void writeVCF(VariantContext vc) {
if (samplesToPhase == null || vc.isNotFiltered())
//if ( samplesToPhase == null || (vc.isVariant() && vc.isNotFiltered())) // if we are only operating on specific samples, don't write out all sites, just those where the VC is variant
WriteVCF.writeVCF(vc, writer, logger);
writer.add(vc);
}
public static boolean processVariantInPhasing(VariantContext vc) {

View File

@ -23,7 +23,7 @@
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
public class ReadBase {
class ReadBase {
public String readName;
public byte base;
public int mappingQual;

View File

@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.util.Iterator;
import java.util.LinkedList;
public class ReadBasesAtPosition implements Iterable<ReadBase> {
class ReadBasesAtPosition implements Iterable<ReadBase> {
// list of: <read name, base>
private LinkedList<ReadBase> bases;

View File

@ -1,189 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
*/
public class RefSeqDataParser {
private static String REFSEQ_PREFIX = "refseq.";
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
private static String NAME_KEY = REFSEQ_PREFIX + "name";
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
private static String[] NAME_KEYS = {NAME_KEY, NAME2_KEY};
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
Map<String, String> entriesToNames = new HashMap<String, String>();
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
if (numRecords != -1) {
boolean done = false;
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
done = true;
}
}
if (!done) {
for (int i = 1; i <= numRecords; i++) {
String key = nameKeyToUseMultiplePrefix + i;
String name = vc.getAttributeAsString(key, null);
if (name != null)
entriesToNames.put(key, name);
}
}
}
else { // no entry with the # of records:
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
}
else { // Check all INFO fields for a match (if there are multiple entries):
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
String key = entry.getKey();
if (key.startsWith(nameKeyToUseMultiplePrefix))
entriesToNames.put(key, entry.getValue().toString());
}
}
}
return entriesToNames;
}
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
return getRefSeqEntriesToNames(vc, false);
}
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
}
public static Set<String> getRefSeqNames(VariantContext vc) {
return getRefSeqNames(vc, false);
}
public static Map<String, Object> getMergedRefSeqNameAttributes(VariantContext vc1, VariantContext vc2) {
Map<String, Object> refSeqNameAttribs = new HashMap<String, Object>();
Map<String, RefSeqEntry> entriesMap1 = getAllRefSeqEntriesByName(vc1);
Map<String, RefSeqEntry> entriesMap2 = getAllRefSeqEntriesByName(vc2);
Set<String> commonNames = entriesMap1.keySet();
commonNames.retainAll(entriesMap2.keySet());
boolean addSuffix = commonNames.size() > 1;
int nextCount = 1;
for (String name : commonNames) {
RefSeqEntry refseq1 = entriesMap1.get(name);
RefSeqEntry refseq2 = entriesMap2.get(name);
String keySuffix = "";
if (addSuffix)
keySuffix = "_" + nextCount;
boolean added = false;
for (String key : NAME_KEYS) {
Object obj1 = refseq1.info.get(key);
Object obj2 = refseq2.info.get(key);
if (obj1 != null && obj2 != null && obj1.equals(obj2)) {
added = true;
String useKey = key + keySuffix;
refSeqNameAttribs.put(useKey, obj1);
}
}
if (added)
nextCount++;
}
int totalCount = nextCount - 1; // since incremented count one extra time
if (totalCount > 1)
refSeqNameAttribs.put(NUM_RECORDS_KEY, totalCount);
return refSeqNameAttribs;
}
public static Map<String, Object> removeRefSeqAttributes(Map<String, Object> attributes) {
Map<String, Object> removedRefSeqAttributes = new HashMap<String, Object>(attributes);
Iterator<Map.Entry<String, Object>> attrIt = removedRefSeqAttributes.entrySet().iterator();
while (attrIt.hasNext()) {
String key = attrIt.next().getKey();
if (key.startsWith(REFSEQ_PREFIX))
attrIt.remove();
}
return removedRefSeqAttributes;
}
private static Map<String, RefSeqEntry> getAllRefSeqEntriesByName(VariantContext vc) {
Map<String, RefSeqEntry> nameToEntries = new TreeMap<String, RefSeqEntry>();
List<RefSeqEntry> allEntries = getAllRefSeqEntries(vc);
for (RefSeqEntry entry : allEntries) {
Object name = entry.info.get(NAME_KEY);
if (name != null)
nameToEntries.put(name.toString(), entry);
}
return nameToEntries;
}
// Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
String entry = entryToName.getKey();
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
}
return allRefSeq;
}
private static class RefSeqEntry {
public Map<String, Object> info;
public RefSeqEntry(VariantContext vc, String entrySuffix) {
this.info = new HashMap<String, Object>();
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
String key = attribEntry.getKey();
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
String genericKey = key.replaceAll(entrySuffix, "");
this.info.put(genericKey, attribEntry.getValue());
}
}
}
}
}

View File

@ -28,7 +28,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
public class SNPallelePair extends AllelePair {
class SNPallelePair extends AllelePair {
public SNPallelePair(Genotype gt) {
super(gt);

View File

@ -1,34 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
public class WriteVCF {
public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) {
writer.add(vc);
}
}

View File

@ -25,13 +25,10 @@ package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.util.StringUtil;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -929,340 +926,4 @@ public class VariantContextUtils {
public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) {
return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true);
}
public abstract static class AlleleMergeRule {
// vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2):
abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2);
public String toString() {
return "all samples are mergeable";
}
}
// NOTE: returns null if vc1 and vc2 are not merged into a single MNP record
public static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) {
if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2))
return null;
// Check that it's logically possible to merge the VCs:
if (!allSamplesAreMergeable(vc1, vc2))
return null;
// Check if there's a "point" in merging the VCs (e.g., annotations could be changed)
if (!alleleMergeRule.allelesShouldBeMerged(vc1, vc2))
return null;
return reallyMergeIntoMNP(vc1, vc2, referenceFile);
}
private static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) {
int startInter = vc1.getEnd() + 1;
int endInter = vc2.getStart() - 1;
byte[] intermediateBases = null;
if (startInter <= endInter) {
intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases();
StringUtil.toUpperCase(intermediateBases);
}
MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added
GenotypeCollection mergedGenotypes = GenotypeCollection.create();
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
List<Allele> mergedAllelesForSample = new LinkedList<Allele>();
/* NOTE: Since merged alleles are added to mergedAllelesForSample in the SAME order as in the input VC records,
we preserve phase information (if any) relative to whatever precedes vc1:
*/
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
Allele mergedAllele = mergeData.ensureMergedAllele(all1, all2);
mergedAllelesForSample.add(mergedAllele);
}
double mergedGQ = Math.max(gt1.getNegLog10PError(), gt2.getNegLog10PError());
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
if (phaseQual.PQ != null)
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
mergedGenotypes.add(mergedGt);
}
String mergedName = VariantContextUtils.mergeVariantContextNames(vc1.getSource(), vc2.getSource());
double mergedNegLog10PError = Math.max(vc1.getNegLog10PError(), vc2.getNegLog10PError());
Set<String> mergedFilters = new HashSet<String>(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered
Map<String, Object> mergedAttribs = VariantContextUtils.mergeVariantContextAttributes(vc1, vc2);
// ids
List<String> mergedIDs = new ArrayList<String>();
if ( vc1.hasID() ) mergedIDs.add(vc1.getID());
if ( vc2.hasID() ) mergedIDs.add(vc2.getID());
String mergedID = Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs);
// TODO -- FIX ID
VariantContext mergedVc = new VariantContext(mergedName, mergedID, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles(), mergedGenotypes, mergedNegLog10PError, mergedFilters, mergedAttribs);
mergedAttribs = new HashMap<String, Object>(mergedVc.getAttributes());
VariantContextUtils.calculateChromosomeCounts(mergedVc, mergedAttribs, true);
mergedVc = VariantContext.modifyAttributes(mergedVc, mergedAttribs);
return mergedVc;
}
private static class AlleleOneAndTwo {
private Allele all1;
private Allele all2;
public AlleleOneAndTwo(Allele all1, Allele all2) {
this.all1 = all1;
this.all2 = all2;
}
public int hashCode() {
return all1.hashCode() + all2.hashCode();
}
public boolean equals(Object other) {
if (!(other instanceof AlleleOneAndTwo))
return false;
AlleleOneAndTwo otherAot = (AlleleOneAndTwo) other;
return (this.all1.equals(otherAot.all1) && this.all2.equals(otherAot.all2));
}
}
private static class MergedAllelesData {
private Map<AlleleOneAndTwo, Allele> mergedAlleles;
private byte[] intermediateBases;
private int intermediateLength;
public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) {
this.mergedAlleles = new HashMap<AlleleOneAndTwo, Allele>(); // implemented equals() and hashCode() for AlleleOneAndTwo
this.intermediateBases = intermediateBases;
this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0;
this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true);
}
public Allele ensureMergedAllele(Allele all1, Allele all2) {
return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor
}
private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) {
AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2);
Allele mergedAllele = mergedAlleles.get(all12);
if (mergedAllele == null) {
byte[] bases1 = all1.getBases();
byte[] bases2 = all2.getBases();
byte[] mergedBases = new byte[bases1.length + intermediateLength + bases2.length];
System.arraycopy(bases1, 0, mergedBases, 0, bases1.length);
if (intermediateBases != null)
System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength);
System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length);
mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime);
mergedAlleles.put(all12, mergedAllele);
}
return mergedAllele;
}
public Set<Allele> getAllMergedAlleles() {
return new HashSet<Allele>(mergedAlleles.values());
}
}
private static String mergeVariantContextNames(String name1, String name2) {
return name1 + "_" + name2;
}
private static Map<String, Object> mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) {
Map<String, Object> mergedAttribs = new HashMap<String, Object>();
List<VariantContext> vcList = new LinkedList<VariantContext>();
vcList.add(vc1);
vcList.add(vc2);
String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY};
for (String orAttrib : MERGE_OR_ATTRIBS) {
boolean attribVal = false;
for (VariantContext vc : vcList) {
attribVal = vc.getAttributeAsBoolean(orAttrib, false);
if (attribVal) // already true, so no reason to continue:
break;
}
mergedAttribs.put(orAttrib, attribVal);
}
return mergedAttribs;
}
private static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) {
GenomeLoc loc1 = VariantContextUtils.getLocation(genomeLocParser, vc1);
GenomeLoc loc2 = VariantContextUtils.getLocation(genomeLocParser, vc2);
if (!loc1.onSameContig(loc2))
throw new ReviewedStingException("Can only merge vc1, vc2 if on the same chromosome");
if (!loc1.isBefore(loc2))
throw new ReviewedStingException("Can only merge if vc1 is BEFORE vc2");
if (vc1.isFiltered() || vc2.isFiltered())
return false;
if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets
return false;
if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2))
return false;
return true;
}
private static boolean allGenotypesAreUnfilteredAndCalled(VariantContext vc) {
for (final Genotype gt : vc.getGenotypes()) {
if (gt.isNoCall() || gt.isFiltered())
return false;
}
return true;
}
// Assumes that vc1 and vc2 were already checked to have the same sample names:
private static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) {
// Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1:
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom
return false;
}
return true;
}
public static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) {
if (gt1.getPloidy() != gt2.getPloidy())
return false;
/* If gt2 is phased or hom, then could even be MERGED with gt1 [This is standard].
HOWEVER, EVEN if this is not the case, but gt1.isHom(),
it is trivially known that each of gt2's alleles segregate with the single allele type present in gt1.
*/
return (gt2.isPhased() || gt2.isHom() || gt1.isHom());
}
private static class PhaseAndQuality {
public boolean isPhased;
public Double PQ = null;
public PhaseAndQuality(Genotype gt) {
this.isPhased = gt.isPhased();
if (this.isPhased) {
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
if ( this.PQ == -1 ) this.PQ = null;
}
}
}
// Assumes that alleleSegregationIsKnown(gt1, gt2):
private static PhaseAndQuality calcPhaseForMergedGenotypes(Genotype gt1, Genotype gt2) {
if (gt2.isPhased() || gt2.isHom())
return new PhaseAndQuality(gt1); // maintain the phase of gt1
if (!gt1.isHom())
throw new ReviewedStingException("alleleSegregationIsKnown(gt1, gt2) implies: gt2.genotypesArePhased() || gt2.isHom() || gt1.isHom()");
/* We're dealing with: gt1.isHom(), gt2.isHet(), !gt2.genotypesArePhased(); so, the merged (het) Genotype is not phased relative to the previous Genotype
For example, if we're merging the third Genotype with the second one:
0/1
1|1
0/1
Then, we want to output:
0/1
1/2
*/
return new PhaseAndQuality(gt2); // maintain the phase of gt2 [since !gt2.genotypesArePhased()]
}
/* Checks if any sample has a MNP of ALT alleles (segregating together):
[Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)]
*/
public static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) {
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next(); // this is OK, since allSamplesAreMergeable()
if (all1.isNonReference() && all2.isNonReference()) // corresponding alleles are alternate
return true;
}
}
return false;
}
/* Checks if all samples are consistent in their haplotypes:
[Assumes that vc1 and vc2 were already checked to have the same sample names && allSamplesAreMergeable(vc1, vc2)]
*/
public static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) {
// Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference):
Map<Allele, Allele> allele1ToAllele2 = new HashMap<Allele, Allele>();
Map<Allele, Allele> allele2ToAllele1 = new HashMap<Allele, Allele>();
// Note the segregation of the alleles for the reference genome:
allele1ToAllele2.put(vc1.getReference(), vc2.getReference());
allele2ToAllele1.put(vc2.getReference(), vc1.getReference());
// Note the segregation of the alleles for each sample (and check that it is consistent with the reference and all previous samples).
for (final Genotype gt1 : vc1.getGenotypes()) {
Genotype gt2 = vc2.getGenotype(gt1.getSampleName());
List<Allele> site1Alleles = gt1.getAlleles();
List<Allele> site2Alleles = gt2.getAlleles();
Iterator<Allele> all2It = site2Alleles.iterator();
for (Allele all1 : site1Alleles) {
Allele all2 = all2It.next();
Allele all1To2 = allele1ToAllele2.get(all1);
if (all1To2 == null)
allele1ToAllele2.put(all1, all2);
else if (!all1To2.equals(all2)) // all1 segregates with two different alleles at site 2
return false;
Allele all2To1 = allele2ToAllele1.get(all2);
if (all2To1 == null)
allele2ToAllele1.put(all2, all1);
else if (!all2To1.equals(all1)) // all2 segregates with two different alleles at site 1
return false;
}
}
return true;
}
}
}