From abf3fcbb72cf7c8f8238f4dac77c7fb6b910874c Mon Sep 17 00:00:00 2001 From: asivache Date: Thu, 24 Feb 2011 16:19:06 +0000 Subject: [PATCH] Little changes in recognized annotation terms; columns in annotated maf are now prioritized and multiple alternatives do not cause 'i don't know what to do' crash: e.g. if Chromosome and chr columns are both present, then Chromosome is taken (has a priority). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5302 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/VariantContextAdaptors.java | 1 + .../gatk/features/maf/MafCodec.java | 80 ++++++++++++------- .../gatk/features/maf/MafFeature.java | 20 +++-- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index c2380558a..3aa81a551 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -359,6 +359,7 @@ public class VariantContextAdaptors { case Flank5: attrs.put("VC","5'flank"); break; case Promoter: attrs.put("VC","5'flank"); break; case De_novo_start: attrs.put("VC","De_novo_start"); break; + case De_novo_start_out_of_frame: attrs.put("VC","De_novo_start_out_of_frame"); break; case Silent: attrs.put("VC","Silent"); break; case Missense: attrs.put("VC","Missense"); break; case Nonsense: attrs.put("VC","Nonsense"); break; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java index 594db71a8..54645aef4 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java @@ -34,10 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.StingException; import java.io.IOException; -import java.util.Map; -import java.util.HashMap; -import java.util.List; -import java.util.ArrayList; +import java.util.*; import java.lang.reflect.Field; /** @@ -53,20 +50,20 @@ public class MafCodec implements FeatureCodec { private int expectedTokenCount = -1; - private Column BUILD_COL = new Column("NCBI_Build",true); - private Column CHR_COL = new Column("Chromosome",true); - private Column START_COL = new Column("Start_position",true); - private Column END_COL = new Column("End_position",true); - private Column REF_ALLELE_COL = new Column("Reference_Allele",true); - private Column TUMOR_ALLELE1_COL = new Column("Tumor_Seq_Allele1",true); - private Column TUMOR_ALLELE2_COL = new Column("Tumor_Seq_Allele2",true); - private Column TUMOR_SAMPLE_COL = new Column("Tumor_Sample_Barcode",true); - private Column NORMAL_SAMPLE_COL = new Column("Matched_Norm_Sample_Barcode",true); + private Column BUILD_COL = new Column(new String[]{"NCBI_Build","build"},true); + private Column CHR_COL = new Column(new String[] {"Chromosome","chr"},true); + private Column START_COL = new Column(new String[] {"Start_position","start"},true); + private Column END_COL = new Column(new String[]{"End_position","end"},true); + private Column REF_ALLELE_COL = new Column(new String[] {"Reference_Allele","ref_allele"},true); + private Column TUMOR_ALLELE1_COL = new Column(new String[] {"Tumor_Seq_Allele1","tum_allele1"},true); + private Column TUMOR_ALLELE2_COL = new Column(new String[] {"Tumor_Seq_Allele2","tum_allele2"},true); + private Column TUMOR_SAMPLE_COL = new Column(new String[] {"Tumor_Sample_Barcode","tumor_barcode"},true); + private Column NORMAL_SAMPLE_COL = new Column(new String[]{"Matched_Norm_Sample_Barcode","normal_barcode"},true); // optional fields (absent from maf lite): - private Column VARTYPE_COL = new Column("Variant_Type",false); - private Column STRAND_COL = new Column("Strand",false); - private Column HUGO_GENE_COL = new Column("Hugo_Symbol",false); - private Column VARCLASS_COL = new Column("Variant_Classification",false); + private Column VARTYPE_COL = new Column(new String[]{"Variant_Type","classification"},false); + private Column STRAND_COL = new Column(new String[]{"Strand","strand"},false); + private Column HUGO_GENE_COL = new Column(new String[]{"Hugo_Symbol","gene"},false); + private Column VARCLASS_COL = new Column(new String[]{"Variant_Classification","type"},false); public enum MAF_TYPE { @@ -151,7 +148,8 @@ public class MafCodec implements FeatureCodec { log.info("MAF file has "+expectedTokenCount +" columns in first line, unknown file type"); } } - if ( line.contains("Chromosome") && line.contains("Start") && line.contains("Build")) { + if ( line.contains("Chromosome") && line.contains("Start") && line.contains("Build") || + line.contains("build") && line.contains("start") && line.contains("ref_allele") ) { // a naive way to detect the line with column names setColumnsFromHeader(tokens); @@ -181,7 +179,7 @@ public class MafCodec implements FeatureCodec { } } - if ( tokens[CHR_COL.getIndex()].equals("Chromosome") ) return null; // if someone uses this codec manually and feeds it the header line multiple times... + if ( tokens[CHR_COL.getIndex()].equals("Chromosome") || tokens[CHR_COL.getIndex()].equals("chr")) return null; // if someone uses this codec manually and feeds it the header line multiple times... // create a new feature from the line: int start = 0; @@ -339,23 +337,38 @@ public class MafCodec implements FeatureCodec { class Column { int index ; - String name; + List names; boolean required; Column(String name, boolean required) { - this.name = name; + this.names = new ArrayList(); + this.names.add(name); this.required = required; this.index = -1; } - public String getName() { return name; } - public void setName(String name) { this.name = name; } + Column(String [] names, boolean required) { + this.names = new ArrayList(); + for ( int i = 0 ; i < names.length ; i++ ) this.names.add(names[i]); + this.required = required; + this.index = -1; + } + + public String getName() { return names.get(0); } + public Collection getNames() { return names; } + public void setName(String name) { + for ( String n : names ) { + if ( n.equals(name) ) return; + } + this.names.add( name ); + } + public int getIndex() { return index; } public void setIndex(int index) { this.index = index; } public String getValue(String[] fields) { if ( index < fields.length ) return fields[index]; - if ( required ) throw new UserException.MalformedFile("In MAF file: required column "+name+" has index "+index+ + if ( required ) throw new UserException.MalformedFile("In MAF file: required column "+getName()+" has index "+index+ ", but only "+fields.length+ " fields are present in maf line"); return null; } @@ -367,13 +380,20 @@ class Column { * @param throw_exception */ public void setFromMap(Map m, boolean throw_exception) { - Integer i = m.get(this.name); - if ( i == null ) { - if ( this.required && throw_exception ) throw new UserException.MalformedFile("Required column "+this.name+" is missing from the maf file"); - index = -1; - return; // not found +// Integer i = null; + for ( String n : names ) { + if ( m.containsKey(n) ) { +// if ( i != null ) +// throw new UserException.MalformedFile("MAF file contains multiple columns with name or alternative names registered for single data field "+getName()); + // go with the first column name found; we assume here that column names have priorities: + // for instance, if the file has both 'Chromosome' and 'chr' columns, we will just take + // Chromosome and run with that + this.index = m.get(n); + return; + } } - this.index = i.intValue(); // found and set. + if ( this.required && throw_exception ) throw new UserException.MalformedFile("Required column "+getName()+" is missing from the maf file"); + this.index = -1; } /** Sets this column's index from the provided name->index map (i.e. searches for itself in the map). diff --git a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java index 7b561f8c5..dd64e9176 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java @@ -57,7 +57,7 @@ public class MafFeature implements Feature { public enum Classification { Unclassified, Intergenic,Intron,Noncoding_transcript,UTR3,UTR5,Flank5,Silent,Missense, Nonsense, Splice_site, miRNA, - Frameshift, Inframe, Stop_deletion, Promoter,De_novo_start + Frameshift, Inframe, Stop_deletion, Promoter,De_novo_start, De_novo_start_out_of_frame } private Type type = Type.UNKNOWN; @@ -111,8 +111,13 @@ public class MafFeature implements Feature { return hugoSymbol; } - public String setHugoGeneSymbol(String genename) { - return hugoSymbol = genename; + public void setHugoGeneSymbol(String genename) { + int pos = genename.indexOf('|'); + if ( pos < 0 ) { + hugoSymbol = genename; + } else { + hugoSymbol = genename.substring(0,pos); + } } /** @@ -214,13 +219,13 @@ public class MafFeature implements Feature { public void setVariantClassification(String s) { if ( s.equals("IGR") ) { classification = Classification.Intergenic ; return; } if ( s.equals("Intron") ) { classification = Classification.Intron ; return; } - if ( s.equals("3'UTR") ) { classification = Classification.UTR3 ; return; } - if ( s.equals("5'UTR") ) { classification = Classification.UTR5 ; return; } + if ( s.equals("3'UTR") || s.equals("3'-UTR")) { classification = Classification.UTR3 ; return; } + if ( s.equals("5'UTR") || s.equals("5'-UTR")) { classification = Classification.UTR5 ; return; } if ( s.equals("5'-Flank") ) { classification = Classification.Flank5 ; return; } - if ( s.equals("Silent") ) { classification = Classification.Silent ; return; } + if ( s.equals("Silent") || s.equals("Synonymous")) { classification = Classification.Silent ; return; } if ( s.equals("Non-coding_Transcript")) { classification = Classification.Noncoding_transcript; return; } if ( s.equals("Missense") || s.equals("Missense_Mutation") ) { classification = Classification.Missense ; return; } - if ( s.equals("Nonsense_Mutation") ) { classification = Classification.Nonsense ; return; } + if ( s.equals("Nonsense_Mutation") || s.equals("Nonsense") ) { classification = Classification.Nonsense ; return; } if ( s.equals("Splice_Site") ) { classification = Classification.Splice_site ; return; } if ( s.equals("miRNA") ) { classification = Classification.miRNA ; return; } if ( s.equals("Frame_Shift_Ins") ) { classification = Classification.Frameshift ; return; } @@ -233,6 +238,7 @@ public class MafFeature implements Feature { if ( s.equals("Splice_Site_SNP") ) { classification = Classification.Splice_site ; return; } if ( s.equals("Promoter") ) { classification = Classification.Promoter ; return; } if ( s.equals("De_novo_Start") ) { classification = Classification.De_novo_start ; return; } + if ( s.equals("De_novo_Start_OutOfFrame") ) { classification = Classification.De_novo_start_out_of_frame ; return; } if ( s.equals("TX-REF-MISMATCH") ) { classification = Classification.Unclassified ; return; } throw new UserException.MalformedFile("Unknown variant classification: " + s); }