Totally revamped the MNP annotation and put it in its own walker: AnnotateMNPsWalker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4730 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-11-24 18:05:10 +00:00
parent 6b70cde0b9
commit ca70ed611c
5 changed files with 961 additions and 159 deletions

View File

@ -0,0 +1,827 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid;
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.*;
import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
/**
* Walks along all variant ROD loci, and dynamically annotates alleles at MNP records.
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)})
public class AnnotateMNPsWalker extends RodWalker<Integer, Integer> {
@Output(doc = "File to which variants should be written", required = true)
protected VCFWriter writer = null;
private ManualSortingVCFWriter sortingWriter = null;
private LinkedList<String> rodNames = null;
private GenomeLocParser locParser = null;
private TreeMap<GenomeLoc, Set<GenomeLoc>> MNPstartToStops = null; // Must be TreeMap sorted by START sites!
public final static String REFSEQ_ROD_NAME = "refseq";
public final static String VARIANT_ROD_NAME = "variant";
private LocusToFeatures locusToRefSeqFeatures = null;
protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq.";
protected final static String REFSEQ_NAME = "name";
protected final static String REFSEQ_NAME2 = "name2";
protected final static String REFSEQ_POSITION_TYPE = "positionType";
protected final static String REFSEQ_CDS = "CDS";
protected final static String REFSEQ_STRAND = "transcriptStrand";
protected final static String REFSEQ_POS_STRAND = "+";
protected final static String REFSEQ_NEG_STRAND = "-";
protected final static String REFSEQ_CODON_COORD = "codonCoord";
protected final static String REFSEQ_CODING_FRAME = "frame";
protected final static String REFSEQ_REF_CODON = "referenceCodon";
protected final static String REFSEQ_REF_AA = "referenceAA";
protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate";
protected final static String REFSEQ_VARIANT_CODON = "variantCodon";
protected final static String REFSEQ_VARIANT_AA = "variantAA";
protected final static String REFSEQ_CHANGES_AA = "changesAA";
protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass";
protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr";
protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants";
public void initialize() {
rodNames = new LinkedList<String>();
rodNames.add(VARIANT_ROD_NAME);
locParser = getToolkit().getGenomeLocParser();
MNPstartToStops = new TreeMap<GenomeLoc, Set<GenomeLoc>>(); // sorted by start sites
initializeVcfWriter();
locusToRefSeqFeatures = new LocusToFeatures();
}
private void initializeVcfWriter() {
sortingWriter = new ManualSortingVCFWriter(writer);
writer = sortingWriter;
// setup the header fields:
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
}
public boolean generateExtendedEvents() {
return false;
}
public Integer reduceInit() {
return 0;
}
/**
* For each site of interest, annotate it if it's a MNP.
*
* @param tracker the meta-data tracker
* @param ref the reference base
* @param context the context for the given locus
* @return count of MNPs observed
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (tracker == null)
return null;
int numMNPsObserved = 0;
GenomeLoc curLocus = ref.getLocus();
clearOldLocusFeatures(curLocus);
boolean requireStartHere = false; // see EVERY site of the MNP
boolean takeFirstOnly = false; // take as many entries as the VCF file has
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
if (vc.getType() == VariantContext.Type.MNP) {
GenomeLoc MNPloc = VariantContextUtils.getLocation(locParser, vc);
boolean atStartOfMNP = curLocus.getStart() == MNPloc.getStart();
boolean atEndOfMNP = curLocus.getStart() == MNPloc.getStop();
logger.debug("Observed MNP at " + MNPloc);
if (isChrM(vc)) {
if (atStartOfMNP) {
logger.warn("Skipping mitochondrial MNP at " + MNPloc + " due to complexity of coding table [need to know if first codon, etc.]...");
writeVCF(vc);
}
continue;
}
GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), MNPloc.getStop());
final List<Object> refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME);
for (Object refSeqObject : refSeqRODs) {
AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject;
locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc);
}
if (atStartOfMNP) { // MNP is starting here, so register that we're waiting for it
Set<GenomeLoc> stopLocs = MNPstartToStops.get(curLocus);
if (stopLocs == null) {
stopLocs = new HashSet<GenomeLoc>();
MNPstartToStops.put(curLocus, stopLocs);
}
stopLocs.add(stopLoc);
}
if (atEndOfMNP) {
numMNPsObserved++; // only count a MNP at its stop site
logger.debug("Observed end of MNP at " + curLocus);
logger.debug("Current list of per-locus features\n" + locusToRefSeqFeatures);
Map<String, Object> MNPannotations = annotateMNP(vc);
MNPannotations.putAll(vc.getAttributes());
vc = VariantContext.modifyAttributes(vc, MNPannotations);
writeVCF(vc);
GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), MNPloc.getStart());
Set<GenomeLoc> stopLocs = MNPstartToStops.get(startLoc);
if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop)
stopLocs.remove(stopLoc);
if (stopLocs.isEmpty()) // no longer waiting for startLoc
MNPstartToStops.remove(startLoc);
}
}
}
else {
writeVCF(vc);
}
}
Integer mostUpstreamWritableLoc = null;
if (!MNPstartToStops.isEmpty()) {
GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey();
mostUpstreamWritableLoc = waitingForLoc.getStart() - 1;
}
sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc);
return numMNPsObserved;
}
private static boolean isChrM(final VariantContext vc) {
return vc.getChr().equals("chrM") || vc.getChr().equals("MT");
}
private Map<String, Object> annotateMNP(VariantContext vc) {
Map<String, Object> annotations = new HashMap<String, Object>();
RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc);
MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures);
for (Map.Entry<String, RefSeqFeatureList> nameToFeatureEntry : nameToPositionalFeatures.entrySet()) {
String featureName = nameToFeatureEntry.getKey();
RefSeqFeatureList feature = nameToFeatureEntry.getValue();
CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature);
annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.toString());
annotations.put(kb.getKey(REFSEQ_NAME), featureName);
annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2);
annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS);
annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND));
annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString());
kb.incrementFeatureIndex();
}
return annotations;
}
private static class MNPannotationKeyBuilder {
private int featureIndex;
private boolean multipleEntries;
public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) {
this.featureIndex = 1;
this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1;
}
public void incrementFeatureIndex() {
featureIndex++;
}
public String getKey(String type) {
String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type;
if (multipleEntries)
annotationKey += "_" + featureIndex;
return annotationKey;
}
}
private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) {
byte[] primArray = new byte[nonNullArray.length];
for (int i = 0; i < nonNullArray.length; i++) {
if (nonNullArray[i] == null)
throw new ReviewedStingException("nonNullArray[i] == null");
primArray[i] = nonNullArray[i];
}
return primArray;
}
private void clearOldLocusFeatures(GenomeLoc curLoc) {
Iterator<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator();
while (locusFeaturesIt.hasNext()) {
Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locusFeaturesEntry = locusFeaturesIt.next();
if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures()))
locusFeaturesIt.remove();
}
}
public Integer reduce(Integer count, Integer total) {
if (count != null)
total = total + count;
return total;
}
/**
* @param result the number of MNPs processed.
*/
public void onTraversalDone(Integer result) {
System.out.println("Number of MNPs observed: " + result);
writer.close();
}
private void writeVCF(VariantContext vc) {
byte refBase;
if (!vc.isIndel()) {
Allele varAllele = vc.getReference();
refBase = SNPallelePair.getSingleBase(varAllele);
}
else {
refBase = vc.getReferenceBaseForIndel();
}
writer.add(vc, refBase);
}
/*
Inner classes:
*/
// Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP:
private class RefSeqNameToFeatures {
private Map<String, RefSeqFeatureList> nameToFeatures;
public RefSeqNameToFeatures(VariantContext vc) {
this.nameToFeatures = new HashMap<String, RefSeqFeatureList>();
int MNPstart = vc.getStart();
int MNPstop = vc.getEnd();
int MNPlength = MNPstop - MNPstart + 1;
for (int i = 0; i < MNPlength; i++) {
int genomicPosition = MNPstart + i;
GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition);
PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc);
if (locFeatures == null) // no features for posLoc
continue;
for (Map.Entry<String, PositionalRefSeqFeature> nameToFeatureEntry : locFeatures.entrySet()) {
String name = nameToFeatureEntry.getKey();
PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue();
RefSeqFeatureList featureList = nameToFeatures.get(name);
if (featureList == null) {
featureList = new RefSeqFeatureList(MNPlength);
nameToFeatures.put(name, featureList);
}
featureList.updateFeatureAtPosition(i, posFeature);
}
}
}
public Set<Map.Entry<String, RefSeqFeatureList>> entrySet() {
return nameToFeatures.entrySet();
}
}
// For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP
private static class RefSeqFeatureList {
private final static String CODON_FRAME_START = "(";
private final static String CODON_FRAME_END = ")";
private final static String CODON_DELIM = "|";
private CodingRefSeqFeature[] refSeqFeatures;
private String name2;
private Boolean positiveStrand;
private Map<Integer, List<Integer>> codonToIndices; // Map of: codon index -> MNP indices that refer to codon
public RefSeqFeatureList(int MNPlength) {
this.refSeqFeatures = new CodingRefSeqFeature[MNPlength];
for (int i = 0; i < MNPlength; i++)
this.refSeqFeatures[i] = null;
this.name2 = null;
this.positiveStrand = null;
this.codonToIndices = new TreeMap<Integer, List<Integer>>();
}
public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) {
if (name2 == null) {
name2 = feature.name2;
positiveStrand = feature.positiveStrand;
}
else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) {
throw new UserException("Inconsistency between previous RefSeq entry and: " + feature);
}
CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature);
refSeqFeatures[index] = crsf;
List<Integer> indicesWithCodon = codonToIndices.get(crsf.codonCoord);
if (indicesWithCodon == null) {
indicesWithCodon = new LinkedList<Integer>();
codonToIndices.put(crsf.codonCoord, indicesWithCodon);
}
indicesWithCodon.add(index);
}
public Set<Map.Entry<Integer, List<Integer>>> codonIndicesEntrySet() {
return codonToIndices.entrySet();
}
public String getCodonCoordString() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < refSeqFeatures.length; i++) {
CodingRefSeqFeature crsf = refSeqFeatures[i];
if (crsf != null)
sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END);
if (i < refSeqFeatures.length - 1)
sb.append(CODON_DELIM);
}
return sb.toString();
}
}
private static class CodingRefSeqFeature {
protected int codonCoord;
protected int codingFrame;
protected String referenceCodon;
protected String referenceAA;
public CodingRefSeqFeature(PositionalRefSeqFeature feature) {
this.codonCoord = feature.codonCoord;
this.codingFrame = feature.codingFrame;
this.referenceCodon = feature.referenceCodon.toUpperCase();
this.referenceAA = feature.referenceAA;
}
}
private static class CodonAnnotationsForAltAlleles {
protected final static int MIN_CODON_INDEX = 0;
protected final static int NUM_CODON_INDICES = 3;
private final static String CODON_ANNOTATION_DELIM = ",";
private List<SingleCodonAnnotationsForAlleles> alleleAnnotations;
public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) {
this.alleleAnnotations = new LinkedList<SingleCodonAnnotationsForAlleles>();
int MNPstart = vc.getStart();
int MNPstop = vc.getEnd();
int MNPlength = MNPstop - MNPstart + 1;
for (Map.Entry<Integer, List<Integer>> codonToIndicesEntry : feature.codonIndicesEntrySet()) {
int codonIndex = codonToIndicesEntry.getKey();
List<Integer> indices = codonToIndicesEntry.getValue();
if (indices.isEmpty())
throw new ReviewedStingException("indices should not exist if it's empty!");
for (int index : indices) {
int frame = feature.refSeqFeatures[index].codingFrame;
if (feature.refSeqFeatures[index].codonCoord != codonIndex)
throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex");
if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES)
throw new UserException("RefSeq codon frame not one of {0,1,2}");
}
CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)];
String refCodon = firstFeatureForCodon.referenceCodon;
SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, vc.getAlternateAlleles(), MNPlength, refCodon, firstFeatureForCodon, indices, feature);
alleleAnnotations.add(codonAnnotation);
}
}
public String toString() {
StringBuilder sb = new StringBuilder();
int index = 0;
for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) {
sb.append(codonToAlleles);
if (index < alleleAnnotations.size() - 1)
sb.append(CODON_ANNOTATION_DELIM);
index++;
}
return sb.toString();
}
}
private static class SingleCodonAnnotationsForAlleles {
private final static String CODON_MAP_SYMBOL = "->";
private final static String CODON_ANNOTATION_START = "[";
private final static String CODON_ANNOTATION_END = "]";
private final static String REF_CODON_INFO_DELIM = "|";
private final static String ALLELE_ANNOTATION_DELIM = ",";
private final static String ASSIGNMENT = ":";
private int codonIndex;
private String refCodon;
private String refAA;
private List<SingleCodonAnnotationsForAllele> annotationsForAlleles;
public SingleCodonAnnotationsForAlleles(int codonIndex, Collection<Allele> altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List<Integer> indices, RefSeqFeatureList feature) {
if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES)
throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES);
AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon);
if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA))
throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + " = reference AA");
this.codonIndex = codonIndex;
this.refCodon = refCodon;
this.refAA = refAA.getCode();
this.annotationsForAlleles = new LinkedList<SingleCodonAnnotationsForAllele>();
for (Allele altAllele : altAlleles) {
if (altAllele.length() != MNPlength)
throw new ReviewedStingException("length(altAllele) != length(MNP)");
byte[] altBases = altAllele.getBases();
Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES];
for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++)
variantCodonArr[i] = null;
for (int index : indices) {
int frame = feature.refSeqFeatures[index].codingFrame;
if (variantCodonArr[frame] != null)
throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame);
byte base = altBases[index];
if (!feature.positiveStrand) // negative strand codon
base = BaseUtils.simpleComplement(base);
variantCodonArr[frame] = base;
}
/* For missing frames, there MUST exist AT LEAST one index that refers to this codon,
so use it to derive the missing bases [ALREADY complemented if on the negative strand]:
*/
for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) {
if (variantCodonArr[frame] == null)
variantCodonArr[frame] = (byte) refCodon.charAt(frame);
}
String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase();
SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex);
annotationsForAlleles.add(alleleAnnotation);
}
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START);
sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM);
sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM);
int index = 0;
for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) {
sb.append(annotation);
if (index < annotationsForAlleles.size() - 1)
sb.append(ALLELE_ANNOTATION_DELIM);
index++;
}
sb.append(CODON_ANNOTATION_END);
return sb.toString();
}
}
private static class SingleCodonAnnotationsForAllele {
private final static String ALLELE_START = "{";
private final static String ALLELE_END = "}";
private final static String CODON_INFO_DELIM = "|";
private final static String ASSIGNMENT = ":";
private String variantCodon;
private String variantAA;
private boolean changesAA;
private String functionalClass;
private String proteinCoordStr;
public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) {
this.variantCodon = variantCodon;
AminoAcid variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon);
this.variantAA = variantAA.getCode();
this.changesAA = !refAA.equals(variantAA);
if (!this.variantCodon.equals(refCodon)) {
if (changesAA) {
if (variantAA.isStop()) {
functionalClass = "nonsense";
}
else if (refAA.isStop()) {
functionalClass = "readthrough";
}
else {
functionalClass = "missense";
}
}
else { // the same aa:
functionalClass = "silent";
}
}
else { // the same codon:
functionalClass = "no_change";
}
this.proteinCoordStr = "p." + refAA.getLetter() + codonIndex + variantAA.getLetter();
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(ALLELE_START);
sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(variantCodon).append(CODON_INFO_DELIM);
sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(variantAA).append(CODON_INFO_DELIM);
sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(changesAA).append(CODON_INFO_DELIM);
sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(functionalClass).append(CODON_INFO_DELIM);
sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr);
sb.append(ALLELE_END);
return sb.toString();
}
}
}
// External classes:
class LocusToFeatures {
private Map<GenomeLoc, PositionalRefSeqFeatures> locusToFeatures;
public LocusToFeatures() {
this.locusToFeatures = new TreeMap<GenomeLoc, PositionalRefSeqFeatures>();
}
public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) {
return locusToFeatures.get(loc);
}
public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
PositionalRefSeqFeatures locFeatures = locusToFeatures.get(loc);
if (locFeatures == null) {
locFeatures = new PositionalRefSeqFeatures(locusUsingThis);
locusToFeatures.put(loc, locFeatures);
}
locFeatures.putFeature(refSeqAnnotation, locusUsingThis);
}
public Set<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> entrySet() {
return locusToFeatures.entrySet();
}
public String toString() { // INTERNAL use only
StringBuilder sb = new StringBuilder();
for (Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locFeatures : entrySet()) {
GenomeLoc loc = locFeatures.getKey();
PositionalRefSeqFeatures features = locFeatures.getValue();
sb.append("Locus: ").append(loc).append("\n").append(features);
}
return sb.toString();
}
}
class PositionalRefSeqFeatures {
private final static String[] REQUIRE_COLUMNS =
{AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE};
private Map<String, PositionalRefSeqFeature> nameToFeature;
private GenomeLoc furthestLocusUsingFeatures;
public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) {
this.nameToFeature = new HashMap<String, PositionalRefSeqFeature>();
this.furthestLocusUsingFeatures = locusUsingThis;
}
public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
for (String column : REQUIRE_COLUMNS) {
if (!refSeqAnnotation.containsColumnName(column))
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
}
if (locusUsingThis.isPast(furthestLocusUsingFeatures))
furthestLocusUsingFeatures = locusUsingThis;
String posType = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE);
if (!posType.equals(AnnotateMNPsWalker.REFSEQ_CDS)) // only interested in coding sequence annotations
return;
PositionalRefSeqFeature newLocusFeature = new PositionalRefSeqFeature(refSeqAnnotation);
String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME);
PositionalRefSeqFeature locusFeature = nameToFeature.get(refSeqName);
if (locusFeature == null) {
locusFeature = newLocusFeature;
nameToFeature.put(refSeqName, locusFeature);
}
else if (!locusFeature.equals(newLocusFeature)) {
throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation);
}
locusFeature.updateFeature(refSeqAnnotation);
}
public GenomeLoc getFurthestLocusUsingFeatures() {
return furthestLocusUsingFeatures;
}
public Set<Map.Entry<String, PositionalRefSeqFeature>> entrySet() {
return nameToFeature.entrySet();
}
public String toString() { // INTERNAL use only
StringBuilder sb = new StringBuilder();
for (Map.Entry<String, PositionalRefSeqFeature> nameFeatureEntry : entrySet()) {
String name = nameFeatureEntry.getKey();
PositionalRefSeqFeature feature = nameFeatureEntry.getValue();
sb.append(name).append(" -> [").append(feature).append("]\n");
}
return sb.toString();
}
}
class PositionalRefSeqFeature {
private final static String[] REQUIRE_COLUMNS =
{AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND,
AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME,
AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA};
protected String name2;
protected boolean positiveStrand;
protected int codonCoord;
protected int codingFrame;
protected String referenceCodon;
protected String referenceAA;
private Map<String, BaseAnnotations> baseToAnnotations;
public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) {
for (String column : REQUIRE_COLUMNS) {
if (!refSeqAnnotation.containsColumnName(column))
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
}
this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2);
this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND));
this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD));
this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME));
this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON);
this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA);
this.baseToAnnotations = new HashMap<String, BaseAnnotations>();
}
public boolean equals(PositionalRefSeqFeature that) {
return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame
&& this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA);
}
public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) {
if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE))
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE);
String base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE);
baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation));
}
public String toString() { // INTERNAL use only
StringBuilder sb = new StringBuilder();
sb.append("name2= ").append(name2);
sb.append(", positiveStrand= ").append(positiveStrand);
sb.append(", codonCoord= ").append(codonCoord);
sb.append(", codingFrame= ").append(codingFrame);
sb.append(", referenceCodon= ").append(referenceCodon);
sb.append(", referenceAA= ").append(referenceAA);
sb.append(", baseAnnotations= {");
for (Map.Entry<String, BaseAnnotations> baseToAnnotationsEntry : baseToAnnotations.entrySet()) {
String base = baseToAnnotationsEntry.getKey();
BaseAnnotations annotations = baseToAnnotationsEntry.getValue();
sb.append(" ").append(base).append(" -> {").append(annotations).append("}");
}
sb.append(" }");
return sb.toString();
}
}
class BaseAnnotations {
private final static String[] REQUIRE_COLUMNS =
{AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA,
AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS,
AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION};
protected String variantCodon;
protected String variantAA;
protected boolean changesAA;
protected String functionalClass;
protected String proteinCoordStr;
public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) {
for (String column : REQUIRE_COLUMNS) {
if (!refSeqAnnotation.containsColumnName(column))
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
}
this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON);
this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA);
this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA));
this.functionalClass = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS);
this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION);
}
public String toString() { // INTERNAL use only
StringBuilder sb = new StringBuilder();
sb.append("variantCodon= ").append(variantCodon);
sb.append(", variantAA= ").append(variantAA);
sb.append(", changesAA= ").append(changesAA);
sb.append(", functionalClass= ").append(functionalClass);
sb.append(", proteinCoordStr= ").append(proteinCoordStr);
return sb.toString();
}
}

View File

@ -169,14 +169,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
if (shouldMerge) {
numRecordsSatisfyingMergeRule++;
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
if (mergedVc != null) {
mergedRecords = true;
Map<String, Object> updatedAttribs = RefSeqData.getMergedRefSeqAttributes(vcfrWaitingToMerge.vc, vc);
updatedAttribs.putAll(mergedVc.getAttributes());
mergedVc = VariantContext.modifyAttributes(mergedVc, updatedAttribs);
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
numMergedRecords++;
}
@ -446,132 +440,4 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
return sb.toString();
}
}
}
/* Some methods for extracting and merging RefSeq-related data from annotated VCF INFO fields:
*/
class RefSeqData {
private static String REFSEQ_PREFIX = "refseq.";
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
private static String NAME_KEY = REFSEQ_PREFIX + "name";
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
private static String CODON_KEY = REFSEQ_PREFIX + "codonCoord";
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
Map<String, String> entriesToNames = new HashMap<String, String>();
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
if (numRecords != null) {
for (int i = 1; i <= numRecords; i++) {
String key = NAME_KEY + "_" + i;
String name = vc.getAttributeAsStringNoException(key);
if (name != null)
entriesToNames.put(key, name);
}
}
else {
String name = vc.getAttributeAsStringNoException(NAME_KEY);
if (name != null) {
entriesToNames.put(NAME_KEY, name);
}
else { // Check all INFO fields for a match:
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
String key = entry.getKey();
if (getName2 && key.startsWith(NAME2_KEY))
entriesToNames.put(key, entry.getValue().toString());
else if (key.startsWith(NAME_KEY) && !key.startsWith(NAME2_KEY))
entriesToNames.put(key, entry.getValue().toString());
}
}
}
return entriesToNames;
}
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
return getRefSeqEntriesToNames(vc, false);
}
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
}
public static Set<String> getRefSeqNames(VariantContext vc) {
return getRefSeqNames(vc, false);
}
public static Map<String, Object> getMergedRefSeqAttributes(VariantContext vc1, VariantContext vc2) {
Map<String, Object> refSeqAttribs = new HashMap<String, Object>();
List<RefSeqEntry> list1 = getAllRefSeqEntries(vc1);
List<RefSeqEntry> list2 = getAllRefSeqEntries(vc2);
boolean addSuffix = list1.size() > 1 || list2.size() > 1;
int count = 1;
for (RefSeqEntry refseq1 : list1) {
for (RefSeqEntry refseq2 : list2) {
Set<String> keys = new HashSet<String>();
keys.addAll(refseq1.info.keySet());
keys.addAll(refseq2.info.keySet());
String keySuffix = "";
if (addSuffix)
keySuffix = "_" + count++;
Object name1 = refseq1.info.get(NAME_KEY);
Object name2 = refseq2.info.get(NAME_KEY);
boolean sameGene = name1 != null && name2 != null && name1.equals(name2);
for (String key : keys) {
Object obj1 = refseq1.info.get(key);
Object obj2 = refseq2.info.get(key);
if (obj1 == null)
obj1 = "";
if (obj2 == null)
obj2 = "";
if (sameGene && key.equals(CODON_KEY) && obj1.equals(obj2)) // vc1 and vc2 have variants in the same codon in the same gene
System.out.println(vc1.getChr() + ":" + vc1.getStart() + " --> CODON: obj1 = " + obj1);
String useKey = key + keySuffix;
String mergedVal = obj1 + "\\" + obj2;
refSeqAttribs.put(useKey, mergedVal);
}
}
}
return refSeqAttribs;
}
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
String entry = entryToName.getKey();
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
}
return allRefSeq;
}
private static class RefSeqEntry {
public Map<String, Object> info;
public RefSeqEntry(VariantContext vc, String entrySuffix) {
this.info = new HashMap<String, Object>();
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
String key = attribEntry.getKey();
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
String genericKey = key.replaceAll(entrySuffix, "");
this.info.put(genericKey, attribEntry.getValue());
}
}
}
}
}

View File

@ -70,12 +70,12 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
protected boolean disablePrintAlternateAlleleStatistics = false;
public final static String IGNORE_CODING = "IGNORE";
public final static String UNION_CODING = "UNION";
public final static String INTERSECT_CODING = "INTERSECT";
public final static String IGNORE_REFSEQ = "IGNORE";
public final static String UNION_REFSEQ = "UNION";
public final static String INTERSECT_REFSEQ = "INTERSECT";
@Argument(fullName = "mergeBasedOnCodingAnnotation", shortName = "mergeBasedOnCodingAnnotation", doc = "'Should merging be performed if two sites lie on the same coding sequence in the INFO field {" + IGNORE_CODING + ", " + UNION_CODING + ", " + INTERSECT_CODING + "}; [default:"+ IGNORE_CODING + "]", required = false)
protected String mergeBasedOnCodingAnnotation = IGNORE_CODING;
@Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:"+ IGNORE_REFSEQ + "]", required = false)
protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;
private LinkedList<String> rodNames = null;
@ -90,10 +90,10 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
MergeRule mergeRule = null;
if (mergeBasedOnCodingAnnotation.equals(IGNORE_CODING))
if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
mergeRule = new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser);
else
mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnCodingAnnotation);
mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnRefSeqAnnotation);
// false <-> don't take control of writer, since didn't create it:
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
@ -175,7 +175,7 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
}
enum MergeBasedOnCodingAnnotation {
enum MergeBasedOnRefSeqAnnotation {
UNION_WITH_DIST, INTERSECT_WITH_DIST
}
@ -206,45 +206,45 @@ class DistanceMergeRule implements MergeRule {
}
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
private MergeBasedOnCodingAnnotation mergeBasedOnCodingAnnotation;
private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnCodingAnnotation) {
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) {
super(maxGenomicDistanceForMNP, genomeLocParser);
if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_CODING))
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.UNION_WITH_DIST;
else if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING))
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST;
if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ))
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST;
else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ))
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST;
else
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_CODING + ", " + MergeSegregatingAlternateAllelesWalker.UNION_CODING + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING + " as argument to mergeBasedOnCodingAnnotation!");
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
}
public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
boolean withinDistance = super.shouldMerge(vc1, vc2);
if (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST)
if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
return withinDistance || sameGene(vc1, vc2);
else // mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST
else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST
return withinDistance && sameGene(vc1, vc2);
}
private boolean sameGene(VariantContext vc1, VariantContext vc2) {
Set<String> names_vc1 = RefSeqData.getRefSeqNames(vc1);
Set<String> names_vc2 = RefSeqData.getRefSeqNames(vc2);
Set<String> names_vc1 = RefSeqDataParser.getRefSeqNames(vc1);
Set<String> names_vc2 = RefSeqDataParser.getRefSeqNames(vc2);
names_vc1.retainAll(names_vc2);
if (!names_vc1.isEmpty())
return true;
// Check refseq.name2:
Set<String> names2_vc1 = RefSeqData.getRefSeqNames(vc1, true);
Set<String> names2_vc2 = RefSeqData.getRefSeqNames(vc2, true);
Set<String> names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true);
Set<String> names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true);
names2_vc1.retainAll(names2_vc2);
return !names2_vc1.isEmpty();
}
public String toString() {
return super.toString() + " " + (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
}
}

View File

@ -543,7 +543,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
private class EdgeToReads {
private Map<GraphEdge, List<String>> edgeReads;
private TreeMap<GraphEdge, List<String>> edgeReads;
public EdgeToReads() {
this.edgeReads = new TreeMap<GraphEdge, List<String>>(); // implemented GraphEdge.compareTo()
@ -1196,7 +1196,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
public static PhasingTable marginalizeAsNewTable(PhasingTable table) {
Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
TreeMap<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
for (PhasingTable.PhasingTableEntry pte : table) {
Haplotype rep = pte.getHaplotypeClass().getRepresentative();
PreciseNonNegativeDouble score = hapMap.get(rep);
@ -1383,7 +1383,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private class MultipleBaseCountsWriter {
private BufferedWriter writer = null;
private Map<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null;
private TreeMap<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null;
public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) {
FileOutputStream output;

View File

@ -0,0 +1,109 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broad.tribble.util.variantcontext.VariantContext;
import java.util.*;
/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
*/
public class RefSeqDataParser {
private static String REFSEQ_PREFIX = "refseq.";
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
private static String NAME_KEY = REFSEQ_PREFIX + "name";
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
Map<String, String> entriesToNames = new HashMap<String, String>();
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
if (numRecords != null) {
for (int i = 1; i <= numRecords; i++) {
String key = nameKeyToUseMultiplePrefix + i;
String name = vc.getAttributeAsStringNoException(key);
if (name != null)
entriesToNames.put(key, name);
}
}
else { // no entry with the # of records:
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
}
else { // Check all INFO fields for a match (if there are multiple entries):
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
String key = entry.getKey();
if (key.startsWith(nameKeyToUseMultiplePrefix))
entriesToNames.put(key, entry.getValue().toString());
}
}
}
return entriesToNames;
}
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
return getRefSeqEntriesToNames(vc, false);
}
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
}
public static Set<String> getRefSeqNames(VariantContext vc) {
return getRefSeqNames(vc, false);
}
// Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
String entry = entryToName.getKey();
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
}
return allRefSeq;
}
private static class RefSeqEntry {
public Map<String, Object> info;
public RefSeqEntry(VariantContext vc, String entrySuffix) {
this.info = new HashMap<String, Object>();
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
String key = attribEntry.getKey();
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
String genericKey = key.replaceAll(entrySuffix, "");
this.info.put(genericKey, attribEntry.getValue());
}
}
}
}
}