Totally revamped the MNP annotation and put it in its own walker: AnnotateMNPsWalker
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4730 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6b70cde0b9
commit
ca70ed611c
|
|
@ -0,0 +1,827 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods;
|
||||
|
||||
|
||||
/**
|
||||
* Walks along all variant ROD loci, and dynamically annotates alleles at MNP records.
|
||||
*/
|
||||
@Allows(value = {DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)})
|
||||
|
||||
public class AnnotateMNPsWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Output(doc = "File to which variants should be written", required = true)
|
||||
protected VCFWriter writer = null;
|
||||
private ManualSortingVCFWriter sortingWriter = null;
|
||||
|
||||
private LinkedList<String> rodNames = null;
|
||||
private GenomeLocParser locParser = null;
|
||||
private TreeMap<GenomeLoc, Set<GenomeLoc>> MNPstartToStops = null; // Must be TreeMap sorted by START sites!
|
||||
|
||||
public final static String REFSEQ_ROD_NAME = "refseq";
|
||||
public final static String VARIANT_ROD_NAME = "variant";
|
||||
|
||||
private LocusToFeatures locusToRefSeqFeatures = null;
|
||||
|
||||
|
||||
protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq.";
|
||||
|
||||
protected final static String REFSEQ_NAME = "name";
|
||||
protected final static String REFSEQ_NAME2 = "name2";
|
||||
|
||||
protected final static String REFSEQ_POSITION_TYPE = "positionType";
|
||||
protected final static String REFSEQ_CDS = "CDS";
|
||||
|
||||
protected final static String REFSEQ_STRAND = "transcriptStrand";
|
||||
protected final static String REFSEQ_POS_STRAND = "+";
|
||||
protected final static String REFSEQ_NEG_STRAND = "-";
|
||||
|
||||
protected final static String REFSEQ_CODON_COORD = "codonCoord";
|
||||
protected final static String REFSEQ_CODING_FRAME = "frame";
|
||||
|
||||
protected final static String REFSEQ_REF_CODON = "referenceCodon";
|
||||
protected final static String REFSEQ_REF_AA = "referenceAA";
|
||||
|
||||
protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate";
|
||||
|
||||
protected final static String REFSEQ_VARIANT_CODON = "variantCodon";
|
||||
protected final static String REFSEQ_VARIANT_AA = "variantAA";
|
||||
protected final static String REFSEQ_CHANGES_AA = "changesAA";
|
||||
protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass";
|
||||
protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr";
|
||||
|
||||
protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants";
|
||||
|
||||
public void initialize() {
|
||||
rodNames = new LinkedList<String>();
|
||||
rodNames.add(VARIANT_ROD_NAME);
|
||||
|
||||
locParser = getToolkit().getGenomeLocParser();
|
||||
MNPstartToStops = new TreeMap<GenomeLoc, Set<GenomeLoc>>(); // sorted by start sites
|
||||
|
||||
initializeVcfWriter();
|
||||
|
||||
locusToRefSeqFeatures = new LocusToFeatures();
|
||||
}
|
||||
|
||||
private void initializeVcfWriter() {
|
||||
sortingWriter = new ManualSortingVCFWriter(writer);
|
||||
writer = sortingWriter;
|
||||
|
||||
// setup the header fields:
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
Map<String, VCFHeader> rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames);
|
||||
writer.writeHeader(new VCFHeader(hInfo, new TreeSet<String>(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples())));
|
||||
}
|
||||
|
||||
public boolean generateExtendedEvents() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each site of interest, annotate it if it's a MNP.
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref the reference base
|
||||
* @param context the context for the given locus
|
||||
* @return count of MNPs observed
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (tracker == null)
|
||||
return null;
|
||||
|
||||
int numMNPsObserved = 0;
|
||||
GenomeLoc curLocus = ref.getLocus();
|
||||
clearOldLocusFeatures(curLocus);
|
||||
|
||||
boolean requireStartHere = false; // see EVERY site of the MNP
|
||||
boolean takeFirstOnly = false; // take as many entries as the VCF file has
|
||||
for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) {
|
||||
if (vc.getType() == VariantContext.Type.MNP) {
|
||||
GenomeLoc MNPloc = VariantContextUtils.getLocation(locParser, vc);
|
||||
boolean atStartOfMNP = curLocus.getStart() == MNPloc.getStart();
|
||||
boolean atEndOfMNP = curLocus.getStart() == MNPloc.getStop();
|
||||
logger.debug("Observed MNP at " + MNPloc);
|
||||
|
||||
if (isChrM(vc)) {
|
||||
if (atStartOfMNP) {
|
||||
logger.warn("Skipping mitochondrial MNP at " + MNPloc + " due to complexity of coding table [need to know if first codon, etc.]...");
|
||||
writeVCF(vc);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), MNPloc.getStop());
|
||||
final List<Object> refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME);
|
||||
for (Object refSeqObject : refSeqRODs) {
|
||||
AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject;
|
||||
locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc);
|
||||
}
|
||||
|
||||
if (atStartOfMNP) { // MNP is starting here, so register that we're waiting for it
|
||||
Set<GenomeLoc> stopLocs = MNPstartToStops.get(curLocus);
|
||||
if (stopLocs == null) {
|
||||
stopLocs = new HashSet<GenomeLoc>();
|
||||
MNPstartToStops.put(curLocus, stopLocs);
|
||||
}
|
||||
stopLocs.add(stopLoc);
|
||||
}
|
||||
|
||||
if (atEndOfMNP) {
|
||||
numMNPsObserved++; // only count a MNP at its stop site
|
||||
logger.debug("Observed end of MNP at " + curLocus);
|
||||
logger.debug("Current list of per-locus features\n" + locusToRefSeqFeatures);
|
||||
|
||||
Map<String, Object> MNPannotations = annotateMNP(vc);
|
||||
MNPannotations.putAll(vc.getAttributes());
|
||||
vc = VariantContext.modifyAttributes(vc, MNPannotations);
|
||||
writeVCF(vc);
|
||||
|
||||
GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), MNPloc.getStart());
|
||||
Set<GenomeLoc> stopLocs = MNPstartToStops.get(startLoc);
|
||||
if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop)
|
||||
stopLocs.remove(stopLoc);
|
||||
if (stopLocs.isEmpty()) // no longer waiting for startLoc
|
||||
MNPstartToStops.remove(startLoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
writeVCF(vc);
|
||||
}
|
||||
}
|
||||
|
||||
Integer mostUpstreamWritableLoc = null;
|
||||
if (!MNPstartToStops.isEmpty()) {
|
||||
GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey();
|
||||
mostUpstreamWritableLoc = waitingForLoc.getStart() - 1;
|
||||
}
|
||||
sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc);
|
||||
|
||||
return numMNPsObserved;
|
||||
}
|
||||
|
||||
private static boolean isChrM(final VariantContext vc) {
|
||||
return vc.getChr().equals("chrM") || vc.getChr().equals("MT");
|
||||
}
|
||||
|
||||
private Map<String, Object> annotateMNP(VariantContext vc) {
|
||||
Map<String, Object> annotations = new HashMap<String, Object>();
|
||||
|
||||
RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc);
|
||||
MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures);
|
||||
|
||||
for (Map.Entry<String, RefSeqFeatureList> nameToFeatureEntry : nameToPositionalFeatures.entrySet()) {
|
||||
String featureName = nameToFeatureEntry.getKey();
|
||||
RefSeqFeatureList feature = nameToFeatureEntry.getValue();
|
||||
CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature);
|
||||
|
||||
annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.toString());
|
||||
annotations.put(kb.getKey(REFSEQ_NAME), featureName);
|
||||
annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2);
|
||||
annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS);
|
||||
annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND));
|
||||
annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString());
|
||||
|
||||
kb.incrementFeatureIndex();
|
||||
}
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
||||
private static class MNPannotationKeyBuilder {
|
||||
private int featureIndex;
|
||||
private boolean multipleEntries;
|
||||
|
||||
public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) {
|
||||
this.featureIndex = 1;
|
||||
this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1;
|
||||
}
|
||||
|
||||
public void incrementFeatureIndex() {
|
||||
featureIndex++;
|
||||
}
|
||||
|
||||
public String getKey(String type) {
|
||||
String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type;
|
||||
if (multipleEntries)
|
||||
annotationKey += "_" + featureIndex;
|
||||
return annotationKey;
|
||||
}
|
||||
}
|
||||
|
||||
private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) {
|
||||
byte[] primArray = new byte[nonNullArray.length];
|
||||
|
||||
for (int i = 0; i < nonNullArray.length; i++) {
|
||||
if (nonNullArray[i] == null)
|
||||
throw new ReviewedStingException("nonNullArray[i] == null");
|
||||
primArray[i] = nonNullArray[i];
|
||||
}
|
||||
|
||||
return primArray;
|
||||
}
|
||||
|
||||
private void clearOldLocusFeatures(GenomeLoc curLoc) {
|
||||
Iterator<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator();
|
||||
while (locusFeaturesIt.hasNext()) {
|
||||
Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locusFeaturesEntry = locusFeaturesIt.next();
|
||||
if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures()))
|
||||
locusFeaturesIt.remove();
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduce(Integer count, Integer total) {
|
||||
if (count != null)
|
||||
total = total + count;
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param result the number of MNPs processed.
|
||||
*/
|
||||
public void onTraversalDone(Integer result) {
|
||||
System.out.println("Number of MNPs observed: " + result);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void writeVCF(VariantContext vc) {
|
||||
byte refBase;
|
||||
if (!vc.isIndel()) {
|
||||
Allele varAllele = vc.getReference();
|
||||
refBase = SNPallelePair.getSingleBase(varAllele);
|
||||
}
|
||||
else {
|
||||
refBase = vc.getReferenceBaseForIndel();
|
||||
}
|
||||
|
||||
writer.add(vc, refBase);
|
||||
}
|
||||
|
||||
/*
|
||||
Inner classes:
|
||||
*/
|
||||
|
||||
// Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP:
|
||||
|
||||
private class RefSeqNameToFeatures {
|
||||
private Map<String, RefSeqFeatureList> nameToFeatures;
|
||||
|
||||
public RefSeqNameToFeatures(VariantContext vc) {
|
||||
this.nameToFeatures = new HashMap<String, RefSeqFeatureList>();
|
||||
|
||||
int MNPstart = vc.getStart();
|
||||
int MNPstop = vc.getEnd();
|
||||
int MNPlength = MNPstop - MNPstart + 1;
|
||||
|
||||
for (int i = 0; i < MNPlength; i++) {
|
||||
int genomicPosition = MNPstart + i;
|
||||
GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition);
|
||||
|
||||
PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc);
|
||||
if (locFeatures == null) // no features for posLoc
|
||||
continue;
|
||||
|
||||
for (Map.Entry<String, PositionalRefSeqFeature> nameToFeatureEntry : locFeatures.entrySet()) {
|
||||
String name = nameToFeatureEntry.getKey();
|
||||
PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue();
|
||||
|
||||
RefSeqFeatureList featureList = nameToFeatures.get(name);
|
||||
if (featureList == null) {
|
||||
featureList = new RefSeqFeatureList(MNPlength);
|
||||
nameToFeatures.put(name, featureList);
|
||||
}
|
||||
featureList.updateFeatureAtPosition(i, posFeature);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Set<Map.Entry<String, RefSeqFeatureList>> entrySet() {
|
||||
return nameToFeatures.entrySet();
|
||||
}
|
||||
}
|
||||
|
||||
// For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP
|
||||
|
||||
private static class RefSeqFeatureList {
|
||||
private final static String CODON_FRAME_START = "(";
|
||||
private final static String CODON_FRAME_END = ")";
|
||||
private final static String CODON_DELIM = "|";
|
||||
|
||||
private CodingRefSeqFeature[] refSeqFeatures;
|
||||
private String name2;
|
||||
private Boolean positiveStrand;
|
||||
|
||||
private Map<Integer, List<Integer>> codonToIndices; // Map of: codon index -> MNP indices that refer to codon
|
||||
|
||||
public RefSeqFeatureList(int MNPlength) {
|
||||
this.refSeqFeatures = new CodingRefSeqFeature[MNPlength];
|
||||
for (int i = 0; i < MNPlength; i++)
|
||||
this.refSeqFeatures[i] = null;
|
||||
|
||||
this.name2 = null;
|
||||
this.positiveStrand = null;
|
||||
this.codonToIndices = new TreeMap<Integer, List<Integer>>();
|
||||
}
|
||||
|
||||
public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) {
|
||||
if (name2 == null) {
|
||||
name2 = feature.name2;
|
||||
positiveStrand = feature.positiveStrand;
|
||||
}
|
||||
else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) {
|
||||
throw new UserException("Inconsistency between previous RefSeq entry and: " + feature);
|
||||
}
|
||||
|
||||
CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature);
|
||||
refSeqFeatures[index] = crsf;
|
||||
|
||||
List<Integer> indicesWithCodon = codonToIndices.get(crsf.codonCoord);
|
||||
if (indicesWithCodon == null) {
|
||||
indicesWithCodon = new LinkedList<Integer>();
|
||||
codonToIndices.put(crsf.codonCoord, indicesWithCodon);
|
||||
}
|
||||
indicesWithCodon.add(index);
|
||||
}
|
||||
|
||||
public Set<Map.Entry<Integer, List<Integer>>> codonIndicesEntrySet() {
|
||||
return codonToIndices.entrySet();
|
||||
}
|
||||
|
||||
public String getCodonCoordString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < refSeqFeatures.length; i++) {
|
||||
CodingRefSeqFeature crsf = refSeqFeatures[i];
|
||||
if (crsf != null)
|
||||
sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END);
|
||||
if (i < refSeqFeatures.length - 1)
|
||||
sb.append(CODON_DELIM);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static class CodingRefSeqFeature {
|
||||
protected int codonCoord;
|
||||
protected int codingFrame;
|
||||
protected String referenceCodon;
|
||||
protected String referenceAA;
|
||||
|
||||
public CodingRefSeqFeature(PositionalRefSeqFeature feature) {
|
||||
this.codonCoord = feature.codonCoord;
|
||||
this.codingFrame = feature.codingFrame;
|
||||
this.referenceCodon = feature.referenceCodon.toUpperCase();
|
||||
this.referenceAA = feature.referenceAA;
|
||||
}
|
||||
}
|
||||
|
||||
private static class CodonAnnotationsForAltAlleles {
|
||||
protected final static int MIN_CODON_INDEX = 0;
|
||||
protected final static int NUM_CODON_INDICES = 3;
|
||||
private final static String CODON_ANNOTATION_DELIM = ",";
|
||||
|
||||
private List<SingleCodonAnnotationsForAlleles> alleleAnnotations;
|
||||
|
||||
public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) {
|
||||
this.alleleAnnotations = new LinkedList<SingleCodonAnnotationsForAlleles>();
|
||||
|
||||
int MNPstart = vc.getStart();
|
||||
int MNPstop = vc.getEnd();
|
||||
int MNPlength = MNPstop - MNPstart + 1;
|
||||
|
||||
for (Map.Entry<Integer, List<Integer>> codonToIndicesEntry : feature.codonIndicesEntrySet()) {
|
||||
int codonIndex = codonToIndicesEntry.getKey();
|
||||
List<Integer> indices = codonToIndicesEntry.getValue();
|
||||
if (indices.isEmpty())
|
||||
throw new ReviewedStingException("indices should not exist if it's empty!");
|
||||
|
||||
for (int index : indices) {
|
||||
int frame = feature.refSeqFeatures[index].codingFrame;
|
||||
if (feature.refSeqFeatures[index].codonCoord != codonIndex)
|
||||
throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex");
|
||||
if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES)
|
||||
throw new UserException("RefSeq codon frame not one of {0,1,2}");
|
||||
}
|
||||
CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)];
|
||||
String refCodon = firstFeatureForCodon.referenceCodon;
|
||||
|
||||
SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, vc.getAlternateAlleles(), MNPlength, refCodon, firstFeatureForCodon, indices, feature);
|
||||
alleleAnnotations.add(codonAnnotation);
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int index = 0;
|
||||
for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) {
|
||||
sb.append(codonToAlleles);
|
||||
if (index < alleleAnnotations.size() - 1)
|
||||
sb.append(CODON_ANNOTATION_DELIM);
|
||||
index++;
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static class SingleCodonAnnotationsForAlleles {
|
||||
private final static String CODON_MAP_SYMBOL = "->";
|
||||
private final static String CODON_ANNOTATION_START = "[";
|
||||
private final static String CODON_ANNOTATION_END = "]";
|
||||
private final static String REF_CODON_INFO_DELIM = "|";
|
||||
private final static String ALLELE_ANNOTATION_DELIM = ",";
|
||||
private final static String ASSIGNMENT = ":";
|
||||
|
||||
private int codonIndex;
|
||||
private String refCodon;
|
||||
private String refAA;
|
||||
|
||||
private List<SingleCodonAnnotationsForAllele> annotationsForAlleles;
|
||||
|
||||
public SingleCodonAnnotationsForAlleles(int codonIndex, Collection<Allele> altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List<Integer> indices, RefSeqFeatureList feature) {
|
||||
if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES)
|
||||
throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES);
|
||||
|
||||
AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon);
|
||||
if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA))
|
||||
throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + " = reference AA");
|
||||
|
||||
this.codonIndex = codonIndex;
|
||||
this.refCodon = refCodon;
|
||||
this.refAA = refAA.getCode();
|
||||
this.annotationsForAlleles = new LinkedList<SingleCodonAnnotationsForAllele>();
|
||||
|
||||
for (Allele altAllele : altAlleles) {
|
||||
if (altAllele.length() != MNPlength)
|
||||
throw new ReviewedStingException("length(altAllele) != length(MNP)");
|
||||
byte[] altBases = altAllele.getBases();
|
||||
|
||||
Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES];
|
||||
for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++)
|
||||
variantCodonArr[i] = null;
|
||||
|
||||
for (int index : indices) {
|
||||
int frame = feature.refSeqFeatures[index].codingFrame;
|
||||
if (variantCodonArr[frame] != null)
|
||||
throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame);
|
||||
|
||||
byte base = altBases[index];
|
||||
if (!feature.positiveStrand) // negative strand codon
|
||||
base = BaseUtils.simpleComplement(base);
|
||||
|
||||
variantCodonArr[frame] = base;
|
||||
}
|
||||
|
||||
/* For missing frames, there MUST exist AT LEAST one index that refers to this codon,
|
||||
so use it to derive the missing bases [ALREADY complemented if on the negative strand]:
|
||||
*/
|
||||
for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) {
|
||||
if (variantCodonArr[frame] == null)
|
||||
variantCodonArr[frame] = (byte) refCodon.charAt(frame);
|
||||
}
|
||||
String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase();
|
||||
|
||||
SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex);
|
||||
annotationsForAlleles.add(alleleAnnotation);
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START);
|
||||
sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM);
|
||||
sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM);
|
||||
|
||||
int index = 0;
|
||||
for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) {
|
||||
sb.append(annotation);
|
||||
if (index < annotationsForAlleles.size() - 1)
|
||||
sb.append(ALLELE_ANNOTATION_DELIM);
|
||||
index++;
|
||||
}
|
||||
sb.append(CODON_ANNOTATION_END);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static class SingleCodonAnnotationsForAllele {
|
||||
private final static String ALLELE_START = "{";
|
||||
private final static String ALLELE_END = "}";
|
||||
private final static String CODON_INFO_DELIM = "|";
|
||||
private final static String ASSIGNMENT = ":";
|
||||
|
||||
private String variantCodon;
|
||||
private String variantAA;
|
||||
private boolean changesAA;
|
||||
private String functionalClass;
|
||||
private String proteinCoordStr;
|
||||
|
||||
public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) {
|
||||
this.variantCodon = variantCodon;
|
||||
AminoAcid variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon);
|
||||
this.variantAA = variantAA.getCode();
|
||||
|
||||
this.changesAA = !refAA.equals(variantAA);
|
||||
|
||||
if (!this.variantCodon.equals(refCodon)) {
|
||||
if (changesAA) {
|
||||
if (variantAA.isStop()) {
|
||||
functionalClass = "nonsense";
|
||||
}
|
||||
else if (refAA.isStop()) {
|
||||
functionalClass = "readthrough";
|
||||
}
|
||||
else {
|
||||
functionalClass = "missense";
|
||||
}
|
||||
}
|
||||
else { // the same aa:
|
||||
functionalClass = "silent";
|
||||
}
|
||||
}
|
||||
else { // the same codon:
|
||||
functionalClass = "no_change";
|
||||
}
|
||||
|
||||
this.proteinCoordStr = "p." + refAA.getLetter() + codonIndex + variantAA.getLetter();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(ALLELE_START);
|
||||
sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(variantCodon).append(CODON_INFO_DELIM);
|
||||
sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(variantAA).append(CODON_INFO_DELIM);
|
||||
sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(changesAA).append(CODON_INFO_DELIM);
|
||||
sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(functionalClass).append(CODON_INFO_DELIM);
|
||||
sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr);
|
||||
sb.append(ALLELE_END);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// External classes:
|
||||
|
||||
class LocusToFeatures {
|
||||
private Map<GenomeLoc, PositionalRefSeqFeatures> locusToFeatures;
|
||||
|
||||
public LocusToFeatures() {
|
||||
this.locusToFeatures = new TreeMap<GenomeLoc, PositionalRefSeqFeatures>();
|
||||
}
|
||||
|
||||
public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) {
|
||||
return locusToFeatures.get(loc);
|
||||
}
|
||||
|
||||
public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
|
||||
PositionalRefSeqFeatures locFeatures = locusToFeatures.get(loc);
|
||||
if (locFeatures == null) {
|
||||
locFeatures = new PositionalRefSeqFeatures(locusUsingThis);
|
||||
locusToFeatures.put(loc, locFeatures);
|
||||
}
|
||||
locFeatures.putFeature(refSeqAnnotation, locusUsingThis);
|
||||
}
|
||||
|
||||
public Set<Map.Entry<GenomeLoc, PositionalRefSeqFeatures>> entrySet() {
|
||||
return locusToFeatures.entrySet();
|
||||
}
|
||||
|
||||
public String toString() { // INTERNAL use only
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (Map.Entry<GenomeLoc, PositionalRefSeqFeatures> locFeatures : entrySet()) {
|
||||
GenomeLoc loc = locFeatures.getKey();
|
||||
PositionalRefSeqFeatures features = locFeatures.getValue();
|
||||
sb.append("Locus: ").append(loc).append("\n").append(features);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class PositionalRefSeqFeatures {
|
||||
private final static String[] REQUIRE_COLUMNS =
|
||||
{AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE};
|
||||
|
||||
private Map<String, PositionalRefSeqFeature> nameToFeature;
|
||||
private GenomeLoc furthestLocusUsingFeatures;
|
||||
|
||||
public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) {
|
||||
this.nameToFeature = new HashMap<String, PositionalRefSeqFeature>();
|
||||
this.furthestLocusUsingFeatures = locusUsingThis;
|
||||
}
|
||||
|
||||
public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) {
|
||||
for (String column : REQUIRE_COLUMNS) {
|
||||
if (!refSeqAnnotation.containsColumnName(column))
|
||||
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
|
||||
}
|
||||
|
||||
if (locusUsingThis.isPast(furthestLocusUsingFeatures))
|
||||
furthestLocusUsingFeatures = locusUsingThis;
|
||||
|
||||
String posType = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE);
|
||||
if (!posType.equals(AnnotateMNPsWalker.REFSEQ_CDS)) // only interested in coding sequence annotations
|
||||
return;
|
||||
|
||||
PositionalRefSeqFeature newLocusFeature = new PositionalRefSeqFeature(refSeqAnnotation);
|
||||
|
||||
String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME);
|
||||
PositionalRefSeqFeature locusFeature = nameToFeature.get(refSeqName);
|
||||
if (locusFeature == null) {
|
||||
locusFeature = newLocusFeature;
|
||||
nameToFeature.put(refSeqName, locusFeature);
|
||||
}
|
||||
else if (!locusFeature.equals(newLocusFeature)) {
|
||||
throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation);
|
||||
}
|
||||
|
||||
locusFeature.updateFeature(refSeqAnnotation);
|
||||
}
|
||||
|
||||
public GenomeLoc getFurthestLocusUsingFeatures() {
|
||||
return furthestLocusUsingFeatures;
|
||||
}
|
||||
|
||||
public Set<Map.Entry<String, PositionalRefSeqFeature>> entrySet() {
|
||||
return nameToFeature.entrySet();
|
||||
}
|
||||
|
||||
public String toString() { // INTERNAL use only
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (Map.Entry<String, PositionalRefSeqFeature> nameFeatureEntry : entrySet()) {
|
||||
String name = nameFeatureEntry.getKey();
|
||||
PositionalRefSeqFeature feature = nameFeatureEntry.getValue();
|
||||
sb.append(name).append(" -> [").append(feature).append("]\n");
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class PositionalRefSeqFeature {
|
||||
private final static String[] REQUIRE_COLUMNS =
|
||||
{AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND,
|
||||
AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME,
|
||||
AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA};
|
||||
|
||||
protected String name2;
|
||||
protected boolean positiveStrand;
|
||||
protected int codonCoord;
|
||||
protected int codingFrame;
|
||||
protected String referenceCodon;
|
||||
protected String referenceAA;
|
||||
|
||||
private Map<String, BaseAnnotations> baseToAnnotations;
|
||||
|
||||
public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) {
|
||||
for (String column : REQUIRE_COLUMNS) {
|
||||
if (!refSeqAnnotation.containsColumnName(column))
|
||||
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
|
||||
}
|
||||
this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2);
|
||||
this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND));
|
||||
this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD));
|
||||
this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME));
|
||||
this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON);
|
||||
this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA);
|
||||
|
||||
this.baseToAnnotations = new HashMap<String, BaseAnnotations>();
|
||||
}
|
||||
|
||||
public boolean equals(PositionalRefSeqFeature that) {
|
||||
return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame
|
||||
&& this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA);
|
||||
}
|
||||
|
||||
public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) {
|
||||
if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE))
|
||||
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE);
|
||||
String base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE);
|
||||
|
||||
baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation));
|
||||
}
|
||||
|
||||
public String toString() { // INTERNAL use only
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("name2= ").append(name2);
|
||||
sb.append(", positiveStrand= ").append(positiveStrand);
|
||||
sb.append(", codonCoord= ").append(codonCoord);
|
||||
sb.append(", codingFrame= ").append(codingFrame);
|
||||
sb.append(", referenceCodon= ").append(referenceCodon);
|
||||
sb.append(", referenceAA= ").append(referenceAA);
|
||||
|
||||
sb.append(", baseAnnotations= {");
|
||||
for (Map.Entry<String, BaseAnnotations> baseToAnnotationsEntry : baseToAnnotations.entrySet()) {
|
||||
String base = baseToAnnotationsEntry.getKey();
|
||||
BaseAnnotations annotations = baseToAnnotationsEntry.getValue();
|
||||
sb.append(" ").append(base).append(" -> {").append(annotations).append("}");
|
||||
}
|
||||
sb.append(" }");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class BaseAnnotations {
|
||||
private final static String[] REQUIRE_COLUMNS =
|
||||
{AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA,
|
||||
AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS,
|
||||
AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION};
|
||||
|
||||
protected String variantCodon;
|
||||
protected String variantAA;
|
||||
protected boolean changesAA;
|
||||
protected String functionalClass;
|
||||
protected String proteinCoordStr;
|
||||
|
||||
public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) {
|
||||
for (String column : REQUIRE_COLUMNS) {
|
||||
if (!refSeqAnnotation.containsColumnName(column))
|
||||
throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column);
|
||||
}
|
||||
this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON);
|
||||
this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA);
|
||||
this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA));
|
||||
this.functionalClass = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS);
|
||||
this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION);
|
||||
}
|
||||
|
||||
|
||||
public String toString() { // INTERNAL use only
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("variantCodon= ").append(variantCodon);
|
||||
sb.append(", variantAA= ").append(variantAA);
|
||||
sb.append(", changesAA= ").append(changesAA);
|
||||
sb.append(", functionalClass= ").append(functionalClass);
|
||||
sb.append(", proteinCoordStr= ").append(proteinCoordStr);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -169,14 +169,8 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
|||
if (shouldMerge) {
|
||||
numRecordsSatisfyingMergeRule++;
|
||||
VariantContext mergedVc = VariantContextUtils.mergeIntoMNP(genomeLocParser, vcfrWaitingToMerge.vc, vc, referenceFileForMNPmerging);
|
||||
|
||||
if (mergedVc != null) {
|
||||
mergedRecords = true;
|
||||
|
||||
Map<String, Object> updatedAttribs = RefSeqData.getMergedRefSeqAttributes(vcfrWaitingToMerge.vc, vc);
|
||||
updatedAttribs.putAll(mergedVc.getAttributes());
|
||||
mergedVc = VariantContext.modifyAttributes(mergedVc, updatedAttribs);
|
||||
|
||||
vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true);
|
||||
numMergedRecords++;
|
||||
}
|
||||
|
|
@ -446,132 +440,4 @@ public class MergePhasedSegregatingAlternateAllelesVCFWriter implements VCFWrite
|
|||
return sb.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Some methods for extracting and merging RefSeq-related data from annotated VCF INFO fields:
|
||||
*/
|
||||
|
||||
class RefSeqData {
|
||||
private static String REFSEQ_PREFIX = "refseq.";
|
||||
|
||||
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
|
||||
private static String NAME_KEY = REFSEQ_PREFIX + "name";
|
||||
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
|
||||
|
||||
private static String CODON_KEY = REFSEQ_PREFIX + "codonCoord";
|
||||
|
||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
|
||||
Map<String, String> entriesToNames = new HashMap<String, String>();
|
||||
|
||||
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
|
||||
if (numRecords != null) {
|
||||
for (int i = 1; i <= numRecords; i++) {
|
||||
String key = NAME_KEY + "_" + i;
|
||||
String name = vc.getAttributeAsStringNoException(key);
|
||||
if (name != null)
|
||||
entriesToNames.put(key, name);
|
||||
}
|
||||
}
|
||||
else {
|
||||
String name = vc.getAttributeAsStringNoException(NAME_KEY);
|
||||
if (name != null) {
|
||||
entriesToNames.put(NAME_KEY, name);
|
||||
}
|
||||
else { // Check all INFO fields for a match:
|
||||
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
|
||||
String key = entry.getKey();
|
||||
if (getName2 && key.startsWith(NAME2_KEY))
|
||||
entriesToNames.put(key, entry.getValue().toString());
|
||||
else if (key.startsWith(NAME_KEY) && !key.startsWith(NAME2_KEY))
|
||||
entriesToNames.put(key, entry.getValue().toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return entriesToNames;
|
||||
}
|
||||
|
||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
|
||||
return getRefSeqEntriesToNames(vc, false);
|
||||
}
|
||||
|
||||
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
|
||||
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
|
||||
}
|
||||
|
||||
public static Set<String> getRefSeqNames(VariantContext vc) {
|
||||
return getRefSeqNames(vc, false);
|
||||
}
|
||||
|
||||
public static Map<String, Object> getMergedRefSeqAttributes(VariantContext vc1, VariantContext vc2) {
|
||||
Map<String, Object> refSeqAttribs = new HashMap<String, Object>();
|
||||
|
||||
List<RefSeqEntry> list1 = getAllRefSeqEntries(vc1);
|
||||
List<RefSeqEntry> list2 = getAllRefSeqEntries(vc2);
|
||||
boolean addSuffix = list1.size() > 1 || list2.size() > 1;
|
||||
int count = 1;
|
||||
|
||||
for (RefSeqEntry refseq1 : list1) {
|
||||
for (RefSeqEntry refseq2 : list2) {
|
||||
Set<String> keys = new HashSet<String>();
|
||||
keys.addAll(refseq1.info.keySet());
|
||||
keys.addAll(refseq2.info.keySet());
|
||||
|
||||
String keySuffix = "";
|
||||
if (addSuffix)
|
||||
keySuffix = "_" + count++;
|
||||
|
||||
Object name1 = refseq1.info.get(NAME_KEY);
|
||||
Object name2 = refseq2.info.get(NAME_KEY);
|
||||
boolean sameGene = name1 != null && name2 != null && name1.equals(name2);
|
||||
|
||||
for (String key : keys) {
|
||||
Object obj1 = refseq1.info.get(key);
|
||||
Object obj2 = refseq2.info.get(key);
|
||||
if (obj1 == null)
|
||||
obj1 = "";
|
||||
if (obj2 == null)
|
||||
obj2 = "";
|
||||
|
||||
if (sameGene && key.equals(CODON_KEY) && obj1.equals(obj2)) // vc1 and vc2 have variants in the same codon in the same gene
|
||||
System.out.println(vc1.getChr() + ":" + vc1.getStart() + " --> CODON: obj1 = " + obj1);
|
||||
|
||||
String useKey = key + keySuffix;
|
||||
String mergedVal = obj1 + "\\" + obj2;
|
||||
refSeqAttribs.put(useKey, mergedVal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return refSeqAttribs;
|
||||
}
|
||||
|
||||
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
|
||||
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
|
||||
|
||||
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
|
||||
String entry = entryToName.getKey();
|
||||
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
|
||||
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
|
||||
}
|
||||
|
||||
return allRefSeq;
|
||||
}
|
||||
|
||||
private static class RefSeqEntry {
|
||||
public Map<String, Object> info;
|
||||
|
||||
public RefSeqEntry(VariantContext vc, String entrySuffix) {
|
||||
this.info = new HashMap<String, Object>();
|
||||
|
||||
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
|
||||
String key = attribEntry.getKey();
|
||||
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
|
||||
String genericKey = key.replaceAll(entrySuffix, "");
|
||||
this.info.put(genericKey, attribEntry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -70,12 +70,12 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
|
|||
@Argument(fullName = "disablePrintAltAlleleStats", shortName = "noAlleleStats", doc = "Should the print-out of alternate allele statistics be disabled?; [default:false]", required = false)
|
||||
protected boolean disablePrintAlternateAlleleStatistics = false;
|
||||
|
||||
public final static String IGNORE_CODING = "IGNORE";
|
||||
public final static String UNION_CODING = "UNION";
|
||||
public final static String INTERSECT_CODING = "INTERSECT";
|
||||
public final static String IGNORE_REFSEQ = "IGNORE";
|
||||
public final static String UNION_REFSEQ = "UNION";
|
||||
public final static String INTERSECT_REFSEQ = "INTERSECT";
|
||||
|
||||
@Argument(fullName = "mergeBasedOnCodingAnnotation", shortName = "mergeBasedOnCodingAnnotation", doc = "'Should merging be performed if two sites lie on the same coding sequence in the INFO field {" + IGNORE_CODING + ", " + UNION_CODING + ", " + INTERSECT_CODING + "}; [default:"+ IGNORE_CODING + "]", required = false)
|
||||
protected String mergeBasedOnCodingAnnotation = IGNORE_CODING;
|
||||
@Argument(fullName = "mergeBasedOnRefSeqAnnotation", shortName = "mergeBasedOnRefSeqAnnotation", doc = "'Should merging be performed if two sites lie on the same RefSeq sequence in the INFO field {" + IGNORE_REFSEQ + ", " + UNION_REFSEQ + ", " + INTERSECT_REFSEQ + "}; [default:"+ IGNORE_REFSEQ + "]", required = false)
|
||||
protected String mergeBasedOnRefSeqAnnotation = IGNORE_REFSEQ;
|
||||
|
||||
private LinkedList<String> rodNames = null;
|
||||
|
||||
|
|
@ -90,10 +90,10 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
|
|||
GenomeLocParser genomeLocParser = getToolkit().getGenomeLocParser();
|
||||
|
||||
MergeRule mergeRule = null;
|
||||
if (mergeBasedOnCodingAnnotation.equals(IGNORE_CODING))
|
||||
if (mergeBasedOnRefSeqAnnotation.equals(IGNORE_REFSEQ))
|
||||
mergeRule = new DistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser);
|
||||
else
|
||||
mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnCodingAnnotation);
|
||||
mergeRule = new SameGenePlusWithinDistanceMergeRule(maxGenomicDistanceForMNP, genomeLocParser, mergeBasedOnRefSeqAnnotation);
|
||||
|
||||
// false <-> don't take control of writer, since didn't create it:
|
||||
vcMergerWriter = new MergePhasedSegregatingAlternateAllelesVCFWriter(writer,genomeLocParser, getToolkit().getArguments().referenceFile, mergeRule, useSingleSample, emitOnlyMergedRecords, logger, false, !disablePrintAlternateAlleleStatistics);
|
||||
|
|
@ -175,7 +175,7 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker<Integer, I
|
|||
}
|
||||
|
||||
|
||||
enum MergeBasedOnCodingAnnotation {
|
||||
enum MergeBasedOnRefSeqAnnotation {
|
||||
UNION_WITH_DIST, INTERSECT_WITH_DIST
|
||||
}
|
||||
|
||||
|
|
@ -206,45 +206,45 @@ class DistanceMergeRule implements MergeRule {
|
|||
}
|
||||
|
||||
class SameGenePlusWithinDistanceMergeRule extends DistanceMergeRule {
|
||||
private MergeBasedOnCodingAnnotation mergeBasedOnCodingAnnotation;
|
||||
private MergeBasedOnRefSeqAnnotation mergeBasedOnRefSeqAnnotation;
|
||||
|
||||
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnCodingAnnotation) {
|
||||
public SameGenePlusWithinDistanceMergeRule(int maxGenomicDistanceForMNP, GenomeLocParser genomeLocParser, String mergeBasedOnRefSeqAnnotation) {
|
||||
super(maxGenomicDistanceForMNP, genomeLocParser);
|
||||
|
||||
if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_CODING))
|
||||
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.UNION_WITH_DIST;
|
||||
else if (mergeBasedOnCodingAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING))
|
||||
this.mergeBasedOnCodingAnnotation = MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST;
|
||||
if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ))
|
||||
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST;
|
||||
else if (mergeBasedOnRefSeqAnnotation.equals(MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ))
|
||||
this.mergeBasedOnRefSeqAnnotation = MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST;
|
||||
else
|
||||
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_CODING + ", " + MergeSegregatingAlternateAllelesWalker.UNION_CODING + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_CODING + " as argument to mergeBasedOnCodingAnnotation!");
|
||||
throw new UserException("Must provide " + MergeSegregatingAlternateAllelesWalker.IGNORE_REFSEQ + ", " + MergeSegregatingAlternateAllelesWalker.UNION_REFSEQ + ", or " + MergeSegregatingAlternateAllelesWalker.INTERSECT_REFSEQ + " as argument to mergeBasedOnRefSeqAnnotation!");
|
||||
}
|
||||
|
||||
public boolean shouldMerge(VariantContext vc1, VariantContext vc2) {
|
||||
boolean withinDistance = super.shouldMerge(vc1, vc2);
|
||||
|
||||
if (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST)
|
||||
if (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST)
|
||||
return withinDistance || sameGene(vc1, vc2);
|
||||
else // mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.INTERSECT_WITH_DIST
|
||||
else // mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.INTERSECT_WITH_DIST
|
||||
return withinDistance && sameGene(vc1, vc2);
|
||||
}
|
||||
|
||||
private boolean sameGene(VariantContext vc1, VariantContext vc2) {
|
||||
Set<String> names_vc1 = RefSeqData.getRefSeqNames(vc1);
|
||||
Set<String> names_vc2 = RefSeqData.getRefSeqNames(vc2);
|
||||
Set<String> names_vc1 = RefSeqDataParser.getRefSeqNames(vc1);
|
||||
Set<String> names_vc2 = RefSeqDataParser.getRefSeqNames(vc2);
|
||||
names_vc1.retainAll(names_vc2);
|
||||
|
||||
if (!names_vc1.isEmpty())
|
||||
return true;
|
||||
|
||||
// Check refseq.name2:
|
||||
Set<String> names2_vc1 = RefSeqData.getRefSeqNames(vc1, true);
|
||||
Set<String> names2_vc2 = RefSeqData.getRefSeqNames(vc2, true);
|
||||
Set<String> names2_vc1 = RefSeqDataParser.getRefSeqNames(vc1, true);
|
||||
Set<String> names2_vc2 = RefSeqDataParser.getRefSeqNames(vc2, true);
|
||||
names2_vc1.retainAll(names2_vc2);
|
||||
|
||||
return !names2_vc1.isEmpty();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return super.toString() + " " + (mergeBasedOnCodingAnnotation == MergeBasedOnCodingAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
|
||||
return super.toString() + " " + (mergeBasedOnRefSeqAnnotation == MergeBasedOnRefSeqAnnotation.UNION_WITH_DIST ? "OR" : "AND") + " on the same gene";
|
||||
}
|
||||
}
|
||||
|
|
@ -543,7 +543,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
private class EdgeToReads {
|
||||
private Map<GraphEdge, List<String>> edgeReads;
|
||||
private TreeMap<GraphEdge, List<String>> edgeReads;
|
||||
|
||||
public EdgeToReads() {
|
||||
this.edgeReads = new TreeMap<GraphEdge, List<String>>(); // implemented GraphEdge.compareTo()
|
||||
|
|
@ -1196,7 +1196,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
}
|
||||
|
||||
public static PhasingTable marginalizeAsNewTable(PhasingTable table) {
|
||||
Map<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
|
||||
TreeMap<Haplotype, PreciseNonNegativeDouble> hapMap = new TreeMap<Haplotype, PreciseNonNegativeDouble>();
|
||||
for (PhasingTable.PhasingTableEntry pte : table) {
|
||||
Haplotype rep = pte.getHaplotypeClass().getRepresentative();
|
||||
PreciseNonNegativeDouble score = hapMap.get(rep);
|
||||
|
|
@ -1383,7 +1383,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
private class MultipleBaseCountsWriter {
|
||||
private BufferedWriter writer = null;
|
||||
private Map<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null;
|
||||
private TreeMap<SampleReadLocus, MultipleBaseCounts> multipleBaseCounts = null;
|
||||
|
||||
public MultipleBaseCountsWriter(File outputMultipleBaseCountsFile) {
|
||||
FileOutputStream output;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,109 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import java.util.*;
|
||||
|
||||
/* Some methods for extracting RefSeq-related data from annotated VCF INFO fields:
|
||||
*/
|
||||
public class RefSeqDataParser {
|
||||
private static String REFSEQ_PREFIX = "refseq.";
|
||||
|
||||
private static String NUM_RECORDS_KEY = REFSEQ_PREFIX + "numMatchingRecords";
|
||||
private static String NAME_KEY = REFSEQ_PREFIX + "name";
|
||||
private static String NAME2_KEY = REFSEQ_PREFIX + "name2";
|
||||
|
||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc, boolean getName2) {
|
||||
String nameKeyToUse = getName2 ? NAME2_KEY : NAME_KEY;
|
||||
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
|
||||
|
||||
Map<String, String> entriesToNames = new HashMap<String, String>();
|
||||
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
|
||||
if (numRecords != null) {
|
||||
for (int i = 1; i <= numRecords; i++) {
|
||||
String key = nameKeyToUseMultiplePrefix + i;
|
||||
String name = vc.getAttributeAsStringNoException(key);
|
||||
if (name != null)
|
||||
entriesToNames.put(key, name);
|
||||
}
|
||||
}
|
||||
else { // no entry with the # of records:
|
||||
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
|
||||
if (name != null) {
|
||||
entriesToNames.put(nameKeyToUse, name);
|
||||
}
|
||||
else { // Check all INFO fields for a match (if there are multiple entries):
|
||||
for (Map.Entry<String, Object> entry : vc.getAttributes().entrySet()) {
|
||||
String key = entry.getKey();
|
||||
if (key.startsWith(nameKeyToUseMultiplePrefix))
|
||||
entriesToNames.put(key, entry.getValue().toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
return entriesToNames;
|
||||
}
|
||||
|
||||
private static Map<String, String> getRefSeqEntriesToNames(VariantContext vc) {
|
||||
return getRefSeqEntriesToNames(vc, false);
|
||||
}
|
||||
|
||||
public static Set<String> getRefSeqNames(VariantContext vc, boolean getName2) {
|
||||
return new TreeSet<String>(getRefSeqEntriesToNames(vc, getName2).values());
|
||||
}
|
||||
|
||||
public static Set<String> getRefSeqNames(VariantContext vc) {
|
||||
return getRefSeqNames(vc, false);
|
||||
}
|
||||
|
||||
// Returns a List of SEPARATE Map<refseq.ENTRY, refseq.VALUE> for EACH RefSeq annotation (i.e., each gene), stripping out the "_1", "_2", etc.
|
||||
private static List<RefSeqEntry> getAllRefSeqEntries(VariantContext vc) {
|
||||
List<RefSeqEntry> allRefSeq = new LinkedList<RefSeqEntry>();
|
||||
|
||||
for (Map.Entry<String, String> entryToName : getRefSeqEntriesToNames(vc).entrySet()) {
|
||||
String entry = entryToName.getKey();
|
||||
String entrySuffix = entry.replaceFirst(NAME_KEY, "");
|
||||
allRefSeq.add(new RefSeqEntry(vc, entrySuffix));
|
||||
}
|
||||
|
||||
return allRefSeq;
|
||||
}
|
||||
|
||||
private static class RefSeqEntry {
|
||||
public Map<String, Object> info;
|
||||
|
||||
public RefSeqEntry(VariantContext vc, String entrySuffix) {
|
||||
this.info = new HashMap<String, Object>();
|
||||
|
||||
for (Map.Entry<String, Object> attribEntry : vc.getAttributes().entrySet()) {
|
||||
String key = attribEntry.getKey();
|
||||
if (key.startsWith(REFSEQ_PREFIX) && key.endsWith(entrySuffix)) {
|
||||
String genericKey = key.replaceAll(entrySuffix, "");
|
||||
this.info.put(genericKey, attribEntry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue