From acd6bd24309d52d743a9e1b6ef4861613e100171 Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 1 Sep 2010 23:30:28 +0000 Subject: [PATCH] Experimental tool to annotates indels that are provided in a VCF file based on RefGene. Specifies gene, transcript, strand, type (Non-frameshift, frameshift, 5'-UTR, 3'-UTR, SpliceSiteDisruption, Intron, or Unknown). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4191 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/IndelAnnotator.java | 173 ++++++++++++------ 1 file changed, 122 insertions(+), 51 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java index 58cc2f213..2475b13ca 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java @@ -15,6 +15,8 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuil import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.vcf.VCFUtils; @@ -23,47 +25,15 @@ import java.io.File; import java.io.IOException; import java.util.*; -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date Apr 21, 2010 - */ -public class IndelAnnotator extends RodWalker{ - +public class IndelAnnotator extends RodWalker { @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=true) + @Argument(fullName="refseq", shortName="refseq", doc="Name of RefSeq transcript annotation file", required=true) String RefseqFileName = null; - private static String annGenomic = "GENOMIC"; - private static String annIntron = "INTRON"; - private static String annUTR = "UTR"; - private static String annCoding = "CODING"; - private static String annUnknown = "UNKNOWN"; - private SeekableRODIterator refseqIterator; - private String getAnnotationString(RODRecordList ann) { - if ( ann == null ) return annGenomic; - else { - StringBuilder b = new StringBuilder(); - - if ( RefSeqFeature.isExon(ann) ) { - if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence - else b.append(annUTR); // exon but not coding = UTR - } else { - if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron - else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... - } - b.append('\t'); - b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed - return b.toString(); - } - } - public void initialize() { if ( RefseqFileName != null ) { TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); @@ -75,17 +45,26 @@ public class IndelAnnotator extends RodWalker{ throw new StingException("Unable to open file " + RefseqFileName, e); } - logger.info("Using RefSeq annotations from "+RefseqFileName); + logger.info("Using RefSeq annotations from " + RefseqFileName); } if ( refseqIterator == null ) logger.info("No annotations available"); - Set hInfo = new HashSet(); + Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFHeaderLine("source", "IndelAnnotator")); hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName())); + HashSet anno = new HashSet(); - anno.add(new VCFInfoHeaderLine("type",1, VCFHeaderLineType.String,"Genomic interpretation (according to RefSeq)")); + anno.add(new VCFInfoHeaderLine("cDNAchange", 1, VCFHeaderLineType.String, "cDNAchange")); + anno.add(new VCFInfoHeaderLine("classification", 1, VCFHeaderLineType.String, "classification")); + anno.add(new VCFInfoHeaderLine("codonchange", 1, VCFHeaderLineType.String, "codonchange")); + anno.add(new VCFInfoHeaderLine("gene", 1, VCFHeaderLineType.String, "gene")); + anno.add(new VCFInfoHeaderLine("genomechange", 1, VCFHeaderLineType.String, "genomechange")); + anno.add(new VCFInfoHeaderLine("proteinchange", 1, VCFHeaderLineType.String, "proteinchange")); + anno.add(new VCFInfoHeaderLine("strand", 1, VCFHeaderLineType.String, "strand")); + anno.add(new VCFInfoHeaderLine("transcript", 1, VCFHeaderLineType.String, "transcript")); + anno.add(new VCFInfoHeaderLine("type", 1, VCFHeaderLineType.String, "type")); hInfo.addAll(anno); VCFHeader vcfHeader = new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())); @@ -96,21 +75,118 @@ public class IndelAnnotator extends RodWalker{ return 0l; } + private TreeMap getAnnotationMap(VariantContext vc, VariantContext dbsnp, RODRecordList ann) { + TreeMap anns = new TreeMap(); + anns.put("gene", "---"); + anns.put("type", "IGR"); + anns.put("transcript", "---"); + anns.put("strand", "+"); + anns.put("proteinchange", "---"); + anns.put("genomechange", "---"); + anns.put("codonchange", "---"); + anns.put("cDNAchange", "---"); + + if (dbsnp != null) { + anns.put("ID", dbsnp.getAttribute("ID")); + } + + if (vc.isIndel()) { + anns.put("classification", vc.isInsertion() ? "INS" : "DEL"); + } + + if ( ann != null ) { + TreeMap> deleteriousnessRankedAnnotations = new TreeMap>(); + + for (int transcriptIndex = 0; transcriptIndex < ann.size(); transcriptIndex++) { + Transcript t = (Transcript) ann.get(transcriptIndex).getUnderlyingObject(); + TreeMap plausibleAnnotations = new TreeMap(); + Integer rank = 0; + + plausibleAnnotations.put("gene", t.getGeneName()); + plausibleAnnotations.put("transcript", t.getTranscriptId()); + plausibleAnnotations.put("strand", t.getStrand() == -1 ? "-" : "+"); + + if ( RefSeqFeature.isExon(ann) ) { + if ( RefSeqFeature.isCodingExon(ann) ) { + //b.append(annCoding); // both exon and coding = coding exon sequence + if (vc.getIndelLengths().get(0) % 3 == 0) { + plausibleAnnotations.put("type", "Non-frameshift"); + rank = 4; + } else { + plausibleAnnotations.put("type", "Frameshift"); + rank = 0; + } + } else { + //b.append(annUTR); // exon but not coding = UTR + if (t.getStrand() == 1) { + plausibleAnnotations.put("type", "5'-UTR"); + rank = 2; + } else { + plausibleAnnotations.put("type", "3'-UTR"); + rank = 3; + } + } + } else { + if ( RefSeqFeature.isCoding(ann) ) { + //b.append(annIntron); // not in exon, but within the coding region = intron + GenomeLoc ig = GenomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); + GenomeLoc cl = t.getCodingLocation(); + GenomeLoc g = t.getLocation(); + + boolean spliceSiteDisruption = false; + + for (GenomeLoc exon : t.getExons()) { + GenomeLoc expandedExon = GenomeLocParser.createGenomeLoc(exon.getContig(), exon.getStart() - 6, exon.getStop() + 6); + + if (ig.overlapsP(expandedExon)) { + spliceSiteDisruption = true; + } + } + + if (spliceSiteDisruption) { + plausibleAnnotations.put("type", "SpliceSiteDisruption"); + rank = 1; + } else { + plausibleAnnotations.put("type", "Intron"); + rank = 5; + } + } else { + //b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... + plausibleAnnotations.put("type", "Unknown"); + rank = 6; + } + } + + deleteriousnessRankedAnnotations.put(rank, plausibleAnnotations); + } + + anns.putAll(deleteriousnessRankedAnnotations.firstEntry().getValue()); + } + + + return anns; + } + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext con) { - if ( tracker == null ) - return 0; + if ( tracker == null ) { return 0; } VariantContext vc = tracker.getVariantContext(ref, "variant", null, con.getLocation(), true); - // ignore places where we don't have a variant - if ( vc == null ) - return 0; + if ( vc == null ) { return 0; } - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(ref.getLocus())); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - annotationString = annotationString.split("\\s+")[0]; + Collection dbsnps = tracker.getVariantContexts(ref, "dbsnp", null, con.getLocation(), true, true); + VariantContext dbsnp = null; + + if (dbsnps != null && dbsnps.size() > 0) { + ArrayList dbsnpsarray = new ArrayList(dbsnps); + dbsnp = dbsnpsarray.get(0); + } + + RODRecordList annotationList = refseqIterator.seekForward(ref.getLocus()); + TreeMap annotationMap = getAnnotationMap(vc, dbsnp, annotationList); Map attrs = new HashMap(vc.getAttributes()); - attrs.put("type",annotationString); + attrs.putAll(annotationMap); + vc = VariantContextUtils.modifyAttributes(vc, attrs); vcfWriter.add(vc, ref.getBase()); @@ -120,9 +196,4 @@ public class IndelAnnotator extends RodWalker{ public Long reduce(Integer i, Long j) { return i + j; } - - public void onTraversalDone(Long l) { - return; - } - } \ No newline at end of file