Experimental tool to annotates indels that are provided in a VCF file based on RefGene. Specifies gene, transcript, strand, type (Non-frameshift, frameshift, 5'-UTR, 3'-UTR, SpliceSiteDisruption, Intron, or Unknown).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4191 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2010-09-01 23:30:28 +00:00
parent dc5f858d29
commit acd6bd2430
1 changed files with 122 additions and 51 deletions

View File

@ -15,6 +15,8 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuil
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
@ -23,47 +25,15 @@ import java.io.File;
import java.io.IOException;
import java.util.*;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
*
* @Author chartl
* @Date Apr 21, 2010
*/
public class IndelAnnotator extends RodWalker<Integer,Long>{
public class IndelAnnotator extends RodWalker<Integer,Long> {
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="refseq", shortName="refseq",
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=true)
@Argument(fullName="refseq", shortName="refseq", doc="Name of RefSeq transcript annotation file", required=true)
String RefseqFileName = null;
private static String annGenomic = "GENOMIC";
private static String annIntron = "INTRON";
private static String annUTR = "UTR";
private static String annCoding = "CODING";
private static String annUnknown = "UNKNOWN";
private SeekableRODIterator refseqIterator;
private String getAnnotationString(RODRecordList ann) {
if ( ann == null ) return annGenomic;
else {
StringBuilder b = new StringBuilder();
if ( RefSeqFeature.isExon(ann) ) {
if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence
else b.append(annUTR); // exon but not coding = UTR
} else {
if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron
else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
}
b.append('\t');
b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed
return b.toString();
}
}
public void initialize() {
if ( RefseqFileName != null ) {
TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder();
@ -75,17 +45,26 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
throw new StingException("Unable to open file " + RefseqFileName, e);
}
logger.info("Using RefSeq annotations from "+RefseqFileName);
logger.info("Using RefSeq annotations from " + RefseqFileName);
}
if ( refseqIterator == null ) logger.info("No annotations available");
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "IndelAnnotator"));
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
HashSet<VCFInfoHeaderLine> anno = new HashSet<VCFInfoHeaderLine>();
anno.add(new VCFInfoHeaderLine("type",1, VCFHeaderLineType.String,"Genomic interpretation (according to RefSeq)"));
anno.add(new VCFInfoHeaderLine("cDNAchange", 1, VCFHeaderLineType.String, "cDNAchange"));
anno.add(new VCFInfoHeaderLine("classification", 1, VCFHeaderLineType.String, "classification"));
anno.add(new VCFInfoHeaderLine("codonchange", 1, VCFHeaderLineType.String, "codonchange"));
anno.add(new VCFInfoHeaderLine("gene", 1, VCFHeaderLineType.String, "gene"));
anno.add(new VCFInfoHeaderLine("genomechange", 1, VCFHeaderLineType.String, "genomechange"));
anno.add(new VCFInfoHeaderLine("proteinchange", 1, VCFHeaderLineType.String, "proteinchange"));
anno.add(new VCFInfoHeaderLine("strand", 1, VCFHeaderLineType.String, "strand"));
anno.add(new VCFInfoHeaderLine("transcript", 1, VCFHeaderLineType.String, "transcript"));
anno.add(new VCFInfoHeaderLine("type", 1, VCFHeaderLineType.String, "type"));
hInfo.addAll(anno);
VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit()));
@ -96,21 +75,118 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
return 0l;
}
private TreeMap<String, Object> getAnnotationMap(VariantContext vc, VariantContext dbsnp, RODRecordList ann) {
TreeMap<String, Object> anns = new TreeMap<String, Object>();
anns.put("gene", "---");
anns.put("type", "IGR");
anns.put("transcript", "---");
anns.put("strand", "+");
anns.put("proteinchange", "---");
anns.put("genomechange", "---");
anns.put("codonchange", "---");
anns.put("cDNAchange", "---");
if (dbsnp != null) {
anns.put("ID", dbsnp.getAttribute("ID"));
}
if (vc.isIndel()) {
anns.put("classification", vc.isInsertion() ? "INS" : "DEL");
}
if ( ann != null ) {
TreeMap<Integer, TreeMap<String, Object>> deleteriousnessRankedAnnotations = new TreeMap<Integer, TreeMap<String, Object>>();
for (int transcriptIndex = 0; transcriptIndex < ann.size(); transcriptIndex++) {
Transcript t = (Transcript) ann.get(transcriptIndex).getUnderlyingObject();
TreeMap<String, Object> plausibleAnnotations = new TreeMap<String, Object>();
Integer rank = 0;
plausibleAnnotations.put("gene", t.getGeneName());
plausibleAnnotations.put("transcript", t.getTranscriptId());
plausibleAnnotations.put("strand", t.getStrand() == -1 ? "-" : "+");
if ( RefSeqFeature.isExon(ann) ) {
if ( RefSeqFeature.isCodingExon(ann) ) {
//b.append(annCoding); // both exon and coding = coding exon sequence
if (vc.getIndelLengths().get(0) % 3 == 0) {
plausibleAnnotations.put("type", "Non-frameshift");
rank = 4;
} else {
plausibleAnnotations.put("type", "Frameshift");
rank = 0;
}
} else {
//b.append(annUTR); // exon but not coding = UTR
if (t.getStrand() == 1) {
plausibleAnnotations.put("type", "5'-UTR");
rank = 2;
} else {
plausibleAnnotations.put("type", "3'-UTR");
rank = 3;
}
}
} else {
if ( RefSeqFeature.isCoding(ann) ) {
//b.append(annIntron); // not in exon, but within the coding region = intron
GenomeLoc ig = GenomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd());
GenomeLoc cl = t.getCodingLocation();
GenomeLoc g = t.getLocation();
boolean spliceSiteDisruption = false;
for (GenomeLoc exon : t.getExons()) {
GenomeLoc expandedExon = GenomeLocParser.createGenomeLoc(exon.getContig(), exon.getStart() - 6, exon.getStop() + 6);
if (ig.overlapsP(expandedExon)) {
spliceSiteDisruption = true;
}
}
if (spliceSiteDisruption) {
plausibleAnnotations.put("type", "SpliceSiteDisruption");
rank = 1;
} else {
plausibleAnnotations.put("type", "Intron");
rank = 5;
}
} else {
//b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon...
plausibleAnnotations.put("type", "Unknown");
rank = 6;
}
}
deleteriousnessRankedAnnotations.put(rank, plausibleAnnotations);
}
anns.putAll(deleteriousnessRankedAnnotations.firstEntry().getValue());
}
return anns;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) {
if ( tracker == null )
return 0;
if ( tracker == null ) { return 0; }
VariantContext vc = tracker.getVariantContext(ref, "variant", null, con.getLocation(), true);
// ignore places where we don't have a variant
if ( vc == null )
return 0;
if ( vc == null ) { return 0; }
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(ref.getLocus()));
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
annotationString = annotationString.split("\\s+")[0];
Collection<VariantContext> dbsnps = tracker.getVariantContexts(ref, "dbsnp", null, con.getLocation(), true, true);
VariantContext dbsnp = null;
if (dbsnps != null && dbsnps.size() > 0) {
ArrayList<VariantContext> dbsnpsarray = new ArrayList<VariantContext>(dbsnps);
dbsnp = dbsnpsarray.get(0);
}
RODRecordList annotationList = refseqIterator.seekForward(ref.getLocus());
TreeMap<String, Object> annotationMap = getAnnotationMap(vc, dbsnp, annotationList);
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
attrs.put("type",annotationString);
attrs.putAll(annotationMap);
vc = VariantContextUtils.modifyAttributes(vc, attrs);
vcfWriter.add(vc, ref.getBase());
@ -120,9 +196,4 @@ public class IndelAnnotator extends RodWalker<Integer,Long>{
public Long reduce(Integer i, Long j) {
return i + j;
}
public void onTraversalDone(Long l) {
return;
}
}