Experimental tool to annotates indels that are provided in a VCF file based on RefGene. Specifies gene, transcript, strand, type (Non-frameshift, frameshift, 5'-UTR, 3'-UTR, SpliceSiteDisruption, Intron, or Unknown).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4191 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dc5f858d29
commit
acd6bd2430
|
|
@ -15,6 +15,8 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuil
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
|
@ -23,47 +25,15 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
|
||||
*
|
||||
* @Author chartl
|
||||
* @Date Apr 21, 2010
|
||||
*/
|
||||
public class IndelAnnotator extends RodWalker<Integer,Long>{
|
||||
|
||||
public class IndelAnnotator extends RodWalker<Integer,Long> {
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfWriter = null;
|
||||
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=true)
|
||||
@Argument(fullName="refseq", shortName="refseq", doc="Name of RefSeq transcript annotation file", required=true)
|
||||
String RefseqFileName = null;
|
||||
|
||||
private static String annGenomic = "GENOMIC";
|
||||
private static String annIntron = "INTRON";
|
||||
private static String annUTR = "UTR";
|
||||
private static String annCoding = "CODING";
|
||||
private static String annUnknown = "UNKNOWN";
|
||||
|
||||
private SeekableRODIterator refseqIterator;
|
||||
|
||||
private String getAnnotationString(RODRecordList ann) {
|
||||
if ( ann == null ) return annGenomic;
|
||||
else {
|
||||
StringBuilder b = new StringBuilder();
|
||||
|
||||
if ( RefSeqFeature.isExon(ann) ) {
|
||||
if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
|
||||
else b.append(annUTR); // exon but not coding = UTR
|
||||
} else {
|
||||
if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
|
||||
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
|
||||
}
|
||||
b.append('\t');
|
||||
b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed
|
||||
return b.toString();
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
if ( RefseqFileName != null ) {
|
||||
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
|
||||
|
|
@ -75,17 +45,26 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
|
|||
throw new StingException("Unable to open file " + RefseqFileName, e);
|
||||
}
|
||||
|
||||
logger.info("Using RefSeq annotations from "+RefseqFileName);
|
||||
logger.info("Using RefSeq annotations from " + RefseqFileName);
|
||||
}
|
||||
|
||||
if ( refseqIterator == null ) logger.info("No annotations available");
|
||||
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "IndelAnnotator"));
|
||||
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
HashSet<VCFInfoHeaderLine> anno = new HashSet<VCFInfoHeaderLine>();
|
||||
anno.add(new VCFInfoHeaderLine("type",1, VCFHeaderLineType.String,"Genomic interpretation (according to RefSeq)"));
|
||||
anno.add(new VCFInfoHeaderLine("cDNAchange", 1, VCFHeaderLineType.String, "cDNAchange"));
|
||||
anno.add(new VCFInfoHeaderLine("classification", 1, VCFHeaderLineType.String, "classification"));
|
||||
anno.add(new VCFInfoHeaderLine("codonchange", 1, VCFHeaderLineType.String, "codonchange"));
|
||||
anno.add(new VCFInfoHeaderLine("gene", 1, VCFHeaderLineType.String, "gene"));
|
||||
anno.add(new VCFInfoHeaderLine("genomechange", 1, VCFHeaderLineType.String, "genomechange"));
|
||||
anno.add(new VCFInfoHeaderLine("proteinchange", 1, VCFHeaderLineType.String, "proteinchange"));
|
||||
anno.add(new VCFInfoHeaderLine("strand", 1, VCFHeaderLineType.String, "strand"));
|
||||
anno.add(new VCFInfoHeaderLine("transcript", 1, VCFHeaderLineType.String, "transcript"));
|
||||
anno.add(new VCFInfoHeaderLine("type", 1, VCFHeaderLineType.String, "type"));
|
||||
hInfo.addAll(anno);
|
||||
|
||||
VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit()));
|
||||
|
|
@ -96,21 +75,118 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
|
|||
return 0l;
|
||||
}
|
||||
|
||||
private TreeMap<String, Object> getAnnotationMap(VariantContext vc, VariantContext dbsnp, RODRecordList ann) {
|
||||
TreeMap<String, Object> anns = new TreeMap<String, Object>();
|
||||
anns.put("gene", "---");
|
||||
anns.put("type", "IGR");
|
||||
anns.put("transcript", "---");
|
||||
anns.put("strand", "+");
|
||||
anns.put("proteinchange", "---");
|
||||
anns.put("genomechange", "---");
|
||||
anns.put("codonchange", "---");
|
||||
anns.put("cDNAchange", "---");
|
||||
|
||||
if (dbsnp != null) {
|
||||
anns.put("ID", dbsnp.getAttribute("ID"));
|
||||
}
|
||||
|
||||
if (vc.isIndel()) {
|
||||
anns.put("classification", vc.isInsertion() ? "INS" : "DEL");
|
||||
}
|
||||
|
||||
if ( ann != null ) {
|
||||
TreeMap<Integer, TreeMap<String, Object>> deleteriousnessRankedAnnotations = new TreeMap<Integer, TreeMap<String, Object>>();
|
||||
|
||||
for (int transcriptIndex = 0; transcriptIndex < ann.size(); transcriptIndex++) {
|
||||
Transcript t = (Transcript) ann.get(transcriptIndex).getUnderlyingObject();
|
||||
TreeMap<String, Object> plausibleAnnotations = new TreeMap<String, Object>();
|
||||
Integer rank = 0;
|
||||
|
||||
plausibleAnnotations.put("gene", t.getGeneName());
|
||||
plausibleAnnotations.put("transcript", t.getTranscriptId());
|
||||
plausibleAnnotations.put("strand", t.getStrand() == -1 ? "-" : "+");
|
||||
|
||||
if ( RefSeqFeature.isExon(ann) ) {
|
||||
if ( RefSeqFeature.isCodingExon(ann) ) {
|
||||
//b.append(annCoding); // both exon and coding = coding exon sequence
|
||||
if (vc.getIndelLengths().get(0) % 3 == 0) {
|
||||
plausibleAnnotations.put("type", "Non-frameshift");
|
||||
rank = 4;
|
||||
} else {
|
||||
plausibleAnnotations.put("type", "Frameshift");
|
||||
rank = 0;
|
||||
}
|
||||
} else {
|
||||
//b.append(annUTR); // exon but not coding = UTR
|
||||
if (t.getStrand() == 1) {
|
||||
plausibleAnnotations.put("type", "5'-UTR");
|
||||
rank = 2;
|
||||
} else {
|
||||
plausibleAnnotations.put("type", "3'-UTR");
|
||||
rank = 3;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if ( RefSeqFeature.isCoding(ann) ) {
|
||||
//b.append(annIntron); // not in exon, but within the coding region = intron
|
||||
GenomeLoc ig = GenomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd());
|
||||
GenomeLoc cl = t.getCodingLocation();
|
||||
GenomeLoc g = t.getLocation();
|
||||
|
||||
boolean spliceSiteDisruption = false;
|
||||
|
||||
for (GenomeLoc exon : t.getExons()) {
|
||||
GenomeLoc expandedExon = GenomeLocParser.createGenomeLoc(exon.getContig(), exon.getStart() - 6, exon.getStop() + 6);
|
||||
|
||||
if (ig.overlapsP(expandedExon)) {
|
||||
spliceSiteDisruption = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (spliceSiteDisruption) {
|
||||
plausibleAnnotations.put("type", "SpliceSiteDisruption");
|
||||
rank = 1;
|
||||
} else {
|
||||
plausibleAnnotations.put("type", "Intron");
|
||||
rank = 5;
|
||||
}
|
||||
} else {
|
||||
//b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
|
||||
plausibleAnnotations.put("type", "Unknown");
|
||||
rank = 6;
|
||||
}
|
||||
}
|
||||
|
||||
deleteriousnessRankedAnnotations.put(rank, plausibleAnnotations);
|
||||
}
|
||||
|
||||
anns.putAll(deleteriousnessRankedAnnotations.firstEntry().getValue());
|
||||
}
|
||||
|
||||
|
||||
return anns;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
if ( tracker == null ) { return 0; }
|
||||
|
||||
VariantContext vc = tracker.getVariantContext(ref, "variant", null, con.getLocation(), true);
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return 0;
|
||||
if ( vc == null ) { return 0; }
|
||||
|
||||
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(ref.getLocus()));
|
||||
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
|
||||
annotationString = annotationString.split("\\s+")[0];
|
||||
Collection<VariantContext> dbsnps = tracker.getVariantContexts(ref, "dbsnp", null, con.getLocation(), true, true);
|
||||
VariantContext dbsnp = null;
|
||||
|
||||
if (dbsnps != null && dbsnps.size() > 0) {
|
||||
ArrayList<VariantContext> dbsnpsarray = new ArrayList<VariantContext>(dbsnps);
|
||||
dbsnp = dbsnpsarray.get(0);
|
||||
}
|
||||
|
||||
RODRecordList annotationList = refseqIterator.seekForward(ref.getLocus());
|
||||
TreeMap<String, Object> annotationMap = getAnnotationMap(vc, dbsnp, annotationList);
|
||||
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
attrs.put("type",annotationString);
|
||||
attrs.putAll(annotationMap);
|
||||
|
||||
vc = VariantContextUtils.modifyAttributes(vc, attrs);
|
||||
vcfWriter.add(vc, ref.getBase());
|
||||
|
||||
|
|
@ -120,9 +196,4 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
|
|||
public Long reduce(Integer i, Long j) {
|
||||
return i + j;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Long l) {
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue