From 7f7d7eb2d18c76ddc4b97e047e6fa9513db44e97 Mon Sep 17 00:00:00 2001 From: asivache Date: Mon, 14 Feb 2011 17:36:39 +0000 Subject: [PATCH] Inconsequential changes, more 'variant classification' values are recognized git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5236 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/VariantContextAdaptors.java | 40 ++- .../gatk/features/maf/MafCodec.java | 272 ++++++++++++------ .../gatk/features/maf/MafFeature.java | 53 ++++ 3 files changed, 272 insertions(+), 93 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 5a8737b73..e6dc319d1 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -337,7 +337,45 @@ public class VariantContextAdaptors { addGenotype(genotypes,tumorSample,maf.getObservedTumorAlleleList(),maf.getRefBases()); - HashMap attrs = new HashMap(1); + HashMap attrs = new HashMap(10); + // fill attributes: + if ( maf.getHugoGeneSymbol() != null && ! 
maf.getHugoGeneSymbol().equals("Unknown")) + attrs.put("Gene",maf.getHugoGeneSymbol()); + + if ( maf.isSomatic() ) { + attrs.put(VCFConstants.SOMATIC_KEY,true); + attrs.put("SS","Somatic"); + } else { + attrs.put("SS","Germline"); + } + + if ( maf.getVariantClassification() != null ) { + switch(maf.getVariantClassification()) { // translate the MAF classification into the "VC" attribute; every case ends with break so a later case cannot overwrite the value + case Intergenic: attrs.put("VC","Genomic"); break; + case Intron: attrs.put("VC","Intron"); break; + case Noncoding_transcript: attrs.put("VC","Noncoding_transcript"); break; + case UTR3: attrs.put("VC","3'UTR"); break; + case UTR5: attrs.put("VC","5'UTR"); break; + case Flank5: attrs.put("VC","5'flank"); break; + case Promoter: attrs.put("VC","5'flank"); break; + case De_novo_start: attrs.put("VC","De_novo_start"); break; + case Silent: attrs.put("VC","Silent"); break; + case Missense: attrs.put("VC","Missense"); break; + case Nonsense: attrs.put("VC","Nonsense"); break; + case Splice: attrs.put("VC","Splice_site"); break; + case miRNA: attrs.put("VC","miRNA"); break; + case Frameshift: attrs.put("VC","Frameshift"); break; + case Inframe: attrs.put("VC","Inframe"); break; + case Stop_deletion: attrs.put("VC","Stop_codon_deletion"); break; + case Splice_site_deletion: attrs.put("VC","Splice_site_deletion"); break; + case Splice_site_insertion: attrs.put("VC","Splice_site_insertion"); break; + case Unclassified: attrs.put("VC","Unclassified"); break; + default: // any other classification: no VC attribute is set + } + } + + attrs.put("VT",maf.getType()); + // attrs.put(VariantContext.ID_KEY, hapmap.getName()); int end = maf.getEnd(); VariantContext vc = new VariantContext(name, maf.getChr(), maf.getStart(), end, alleles, diff --git a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java index b01baef58..594db71a8 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafCodec.java @@ -27,13 +27,18 @@ package
org.broadinstitute.sting.playground.gatk.features.maf; import org.broad.tribble.FeatureCodec; import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.StingException; import java.io.IOException; import java.util.Map; import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; +import java.lang.reflect.Field; /** * Created by IntelliJ IDEA. @@ -47,31 +52,21 @@ public class MafCodec implements FeatureCodec { private int expectedTokenCount = -1; - private int BUILD_COL; - private int CHR_COL; - private int START_COL; - private int END_COL; - private int REF_ALLELE_COL; - private int TUMOR_ALLELE1_COL; - private int TUMOR_ALLELE2_COL; - private int TUMOR_SAMPLE_COL; - private int NORMAL_SAMPLE_COL; - // optional fields (absent from maf lite): - private int VARTYPE_COL = -1; - private int STRAND_COL = -1; - private static String BUILD_COLNAME="NCBI_Build"; - private static String CHR_COLNAME="Chromosome"; - private static String START_COLNAME="Start_position"; - private static String END_COLNAME="End_position"; - private static String REF_ALLELE_COLNAME="Reference_Allele"; - private static String TUMOR_ALLELE1_COLNAME="Tumor_Seq_Allele1"; - private static String TUMOR_ALLELE2_COLNAME="Tumor_Seq_Allele2"; - private static String TUMOR_SAMPLE_COLNAME="Tumor_Sample_Barcode"; - private static String NORMAL_SAMPLE_COLNAME="Matched_Norm_Sample_Barcode"; + private Column BUILD_COL = new Column("NCBI_Build",true); + private Column CHR_COL = new Column("Chromosome",true); + private Column START_COL = new Column("Start_position",true); + private Column END_COL = new Column("End_position",true); + private Column REF_ALLELE_COL = new Column("Reference_Allele",true); + private Column TUMOR_ALLELE1_COL = new Column("Tumor_Seq_Allele1",true); + private 
Column TUMOR_ALLELE2_COL = new Column("Tumor_Seq_Allele2",true); + private Column TUMOR_SAMPLE_COL = new Column("Tumor_Sample_Barcode",true); + private Column NORMAL_SAMPLE_COL = new Column("Matched_Norm_Sample_Barcode",true); // optional fields (absent from maf lite): - private static String VARTYPE_COLNAME="Variant_Type"; - private static String STRAND_COLNAME="Strand"; + private Column VARTYPE_COL = new Column("Variant_Type",false); + private Column STRAND_COL = new Column("Strand",false); + private Column HUGO_GENE_COL = new Column("Hugo_Symbol",false); + private Column VARCLASS_COL = new Column("Variant_Classification",false); public enum MAF_TYPE { @@ -85,18 +80,55 @@ public class MafCodec implements FeatureCodec { private MAF_TYPE mafType=MAF_TYPE.UNKNOWN; + private List allColumns = null; /// filled dynamically by constructor through introspection. Slow but less typing. + + private boolean tooManyColsWarned = false; + private boolean tooFewColsWarned = false; + + public MafCodec() { + allColumns = new ArrayList(30); + Field[] fields = this.getClass().getDeclaredFields(); + try { + for ( Field f : fields ) { + if ( f.get(this) instanceof Column ) { + allColumns.add((Column)f.get(this)); + } + } + } catch (IllegalAccessException e) { + throw new StingException("Error in MAFCodec when trying to introspect itself, this is probably a BUG",e); + } + } + + /** * Decode a line to obtain just its FeatureLoc for indexing -- contig, start, and stop. - * + * This method will NOT fill in the additional information available in the maf file * @param line the input line to decode * @return Return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is * a comment) */ public Feature decodeLoc(String line) { - return decode(line); + return reallyDecode(line,false); } + + /** + * Fully decode a line, will try extracting as much additional/annotation information from the maf file as it can. 
+ * @param line the input line to decode + * @return Return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is + * a comment) + */ public Feature decode(String line) { + return reallyDecode(line,true); + } + + /** Decodes a maf line. If extra is false, will decode only location and return; + * if extra is true, then extracts everything it can (samples, annotations, etc) + * @param line the maf line to decode + * @param extra if false, only the location is decoded; if true, all additional information is extracted as well + * @return the decoded feature, or null if the line does not represent a feature (a comment or a repeated header line) + */ + public Feature reallyDecode(String line, boolean extra) { // ignore commented-out lines if (line.startsWith("#")) return null; @@ -137,24 +169,39 @@ public class MafCodec implements FeatureCodec { if (tokens.length < expectedTokenCount) { - log.error("MAF line contains too few columns ("+tokens.length+")"); - return null; + if ( ! tooFewColsWarned ) { + log.error("MAF line contains too few columns ("+tokens.length+"); this error is reported only once."); + tooFewColsWarned = true; + } } if (tokens.length > expectedTokenCount) { - log.warn("MAF line contains more columns than expected ("+tokens.length+"); extra columns discarded"); + if ( ! tooManyColsWarned ) { + log.warn("MAF line contains more columns than expected ("+tokens.length+"); extra columns discarded. This error is shown only once."); + tooManyColsWarned = true; + } } - if ( tokens[CHR_COL].equals("Chromosome") ) return null; // if someone uses this codec manually and feeds it the header line multiple times... + if ( "Chromosome".equals(CHR_COL.getValue(tokens)) ) return null; // if someone uses this codec manually and feeds it the header line multiple times... (getValue is used because short lines are no longer rejected above: a truncated line is reported as a malformed file instead of failing on a raw array index)
// create a new feature from the line: - int start = Integer.valueOf(tokens[START_COL]); - int stop = Integer.valueOf(tokens[END_COL]); + int start = 0; + try { + start = Integer.parseInt(START_COL.getValue(tokens)); + } catch (NumberFormatException e) { + throw new UserException.MalformedFile("Missing or non-numeric start position in line:\n"+line,e); + } + int stop = 0 ; + try { + stop = Integer.parseInt(END_COL.getValue(tokens)); + } catch (NumberFormatException e) { + throw new UserException.MalformedFile("Missing or non-numeric stop position in line:\n"+line,e); + } String eventType="UNKNOWN"; - String ref = tokens[REF_ALLELE_COL]; - String alt1 = tokens[TUMOR_ALLELE1_COL]; - String alt2 = tokens[TUMOR_ALLELE2_COL]; + String ref = REF_ALLELE_COL.getValue(tokens); + String alt1 = TUMOR_ALLELE1_COL.getValue(tokens); + String alt2 = TUMOR_ALLELE2_COL.getValue(tokens); if ( ref.equals("-") ) { // insertion @@ -208,16 +255,29 @@ public class MafCodec implements FeatureCodec { } } // if we got vartype column, make sure it makes sense: - if ( VARTYPE_COL != -1 && ! tokens[VARTYPE_COL].equals(eventType) ) - throw new UserException.MalformedFile("Inconsistency in MAF: variant looks like a "+eventType +" but annotated as "+ - tokens[VARTYPE_COL]); + if ( VARTYPE_COL.isSet(tokens) && ! tokens[VARTYPE_COL.getIndex()].equals(eventType) ) { + // special case: we annotate everything as MNP while MAF can have DNP/TNP, these are fine (the event type is compared with equals(), not by reference): + if ( "MNP".equals(eventType) && ( + tokens[VARTYPE_COL.getIndex()].equals("DNP") && ref.length() == 2 || + tokens[VARTYPE_COL.getIndex()].equals("TNP") && ref.length() == 3) + ) {} // these are fine + else { + throw new UserException.MalformedFile("Inconsistency in MAF: variant looks like a "+eventType +" but annotated as "+ + tokens[VARTYPE_COL.getIndex()]); + } + } + MafFeature feature = new MafFeature(CHR_COL.getValue(tokens),start,stop); - MafFeature feature = new MafFeature(tokens[CHR_COL],start,stop); + if ( !
extra ) return feature; // ignore additional fields unless we were explicitly asked to read those! + feature.setVariantType(eventType); feature.setRefAllele(ref); feature.setObservedTumor(alt1,alt2); - feature.setTumorSample(tokens[TUMOR_SAMPLE_COL]); - feature.setNormalSample(tokens[NORMAL_SAMPLE_COL]); + feature.setTumorSample(TUMOR_SAMPLE_COL.getValue(tokens)); + feature.setNormalSample(NORMAL_SAMPLE_COL.getValue(tokens)); + + if ( HUGO_GENE_COL.isSet(tokens) ) feature.setHugoGeneSymbol(tokens[HUGO_GENE_COL.getIndex()]); + if ( VARCLASS_COL.isSet(tokens) ) feature.setVariantClassification(tokens[VARCLASS_COL.getIndex()]); return feature; } @@ -239,67 +299,95 @@ public class MafCodec implements FeatureCodec { * */ private void setMafLiteCols() { - BUILD_COL = 0; - CHR_COL = 1; - START_COL = 2; - END_COL = 3; - REF_ALLELE_COL = 4; - TUMOR_ALLELE1_COL = 5; - TUMOR_ALLELE2_COL = 6; - TUMOR_SAMPLE_COL = 7; - NORMAL_SAMPLE_COL = 8; + BUILD_COL.setIndex(0); + CHR_COL.setIndex(1); + START_COL.setIndex(2); + END_COL.setIndex(3); + REF_ALLELE_COL.setIndex(4); + TUMOR_ALLELE1_COL.setIndex(5); + TUMOR_ALLELE2_COL.setIndex(6); + TUMOR_SAMPLE_COL.setIndex(7); + NORMAL_SAMPLE_COL.setIndex(8); } private void setMafAnnotatedCols() { - BUILD_COL = 3; - CHR_COL = 4; - START_COL = 5; - END_COL = 6; - REF_ALLELE_COL = 10; - TUMOR_ALLELE1_COL = 11; - TUMOR_ALLELE2_COL = 12; - TUMOR_SAMPLE_COL = 15; - NORMAL_SAMPLE_COL = 16; - VARTYPE_COL = 9; - STRAND_COL = 7; + BUILD_COL.setIndex(3); + CHR_COL.setIndex(4); + START_COL.setIndex(5); + END_COL.setIndex(6); + REF_ALLELE_COL.setIndex(10); + TUMOR_ALLELE1_COL.setIndex(11); + TUMOR_ALLELE2_COL.setIndex(12); + TUMOR_SAMPLE_COL.setIndex(15); + NORMAL_SAMPLE_COL.setIndex(16); + VARTYPE_COL.setIndex(9); + STRAND_COL.setIndex(7); + VARCLASS_COL.setIndex(8); + HUGO_GENE_COL.setIndex(0); } private void setColumnsFromHeader(String[] tokens) { Map colNames = new HashMap(); for ( int i = 0 ; i < tokens.length ; i++ ) colNames.put(tokens[i],i); - if 
( colNames.containsKey(BUILD_COLNAME) ) BUILD_COL = colNames.get(BUILD_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+BUILD_COLNAME+" column"); - - if ( colNames.containsKey(CHR_COLNAME) ) CHR_COL = colNames.get(CHR_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+CHR_COLNAME+" column"); - - if ( colNames.containsKey(START_COLNAME) ) START_COL = colNames.get(START_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+START_COLNAME+" column"); - - if ( colNames.containsKey(END_COLNAME) ) END_COL = colNames.get(END_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+END_COLNAME+" column"); - - if ( colNames.containsKey(REF_ALLELE_COLNAME) ) REF_ALLELE_COL = colNames.get(REF_ALLELE_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+REF_ALLELE_COLNAME+" column"); - - if ( colNames.containsKey(TUMOR_ALLELE1_COLNAME) ) TUMOR_ALLELE1_COL = colNames.get(TUMOR_ALLELE1_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+TUMOR_ALLELE1_COLNAME+" column"); - - if ( colNames.containsKey(TUMOR_ALLELE2_COLNAME) ) TUMOR_ALLELE2_COL = colNames.get(TUMOR_ALLELE2_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+TUMOR_ALLELE2_COLNAME+" column"); - - if ( colNames.containsKey(TUMOR_SAMPLE_COLNAME) ) TUMOR_SAMPLE_COL = colNames.get(TUMOR_SAMPLE_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+TUMOR_SAMPLE_COLNAME+" column"); - - if ( colNames.containsKey(NORMAL_SAMPLE_COLNAME) ) NORMAL_SAMPLE_COL = colNames.get(NORMAL_SAMPLE_COLNAME); - else throw new UserException.MalformedFile("Maf file does not have "+NORMAL_SAMPLE_COLNAME+" column"); - - // we do not require variant type column but we use it if it's present (for validation): - if ( colNames.containsKey(VARTYPE_COLNAME) ) VARTYPE_COL = colNames.get(VARTYPE_COLNAME); - - // we do not require 
strand column but we use it if it's present (for validation): - if ( colNames.containsKey(STRAND_COLNAME) ) STRAND_COL = colNames.get(STRAND_COLNAME); + for ( Column c : allColumns ) c.setFromMap(colNames); } + } + + +class Column { // one MAF column: its header name, whether it is required, and its index within a line (-1 while unset) + int index ; + String name; + boolean required; + + Column(String name, boolean required) { + this.name = name; + this.required = required; + this.index = -1; + } + + public String getName() { return name; } + public void setName(String name) { this.name = name; } + public int getIndex() { return index; } + public void setIndex(int index) { this.index = index; } + public String getValue(String[] fields) { // returns the field at this column's index; for an unset or out-of-range column returns null if optional and throws if required + if ( index >= 0 && index < fields.length ) return fields[index]; // index is -1 while the column is unset, so the lower bound must be checked too + + if ( required ) throw new UserException.MalformedFile("In MAF file: required column "+name+" has index "+index+ + ", but only "+fields.length+ " fields are present in maf line"); + return null; + } + + /** Sets this column's index from the provided name->index map (i.e. searches for itself in the map). + * If column not found, throw_exception is true AND this column is required, then an exception will + * be thrown right away; otherwise returns quietly even if map does not contain this column. + * @param m map from column name to column index + * @param throw_exception whether a missing required column should raise an exception + */ + public void setFromMap(Map<String,Integer> m, boolean throw_exception) { + Integer i = m.get(this.name); + if ( i == null ) { + if ( this.required && throw_exception ) throw new UserException.MalformedFile("Required column "+this.name+" is missing from the maf file"); + index = -1; + return; // not found + } + this.index = i.intValue(); // found and set. + } + +/** Sets this column's index from the provided name->index map (i.e. searches for itself in the map). + * If this column is required but not found in the map, then an exception will + * be thrown.
+ * @param m map from column name to column index + */ + public void setFromMap(Map<String,Integer> m) { + setFromMap(m,true); + } + + public boolean isSet() { return index > -1; } + + public boolean isSet(String[] fields) { return index > -1 && index < fields.length; } + +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java index 9a2c8a0bb..8fd9889b6 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/features/maf/MafFeature.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.playground.gatk.features.maf; import org.broad.tribble.Feature; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.*; @@ -47,11 +48,18 @@ public class MafFeature implements Feature { private String[] observedNormAlleles = null; // The sequences of the observed alleles in normal private String tumorSampleId = null; private String normalSampleId = null; + private String hugoSymbol = null; + private Classification classification = null; public enum Type { UNKNOWN,SNP,MNP,INS,DEL }; + public enum Classification { + Unclassified, Intergenic,Intron,Noncoding_transcript,UTR3,UTR5,Flank5,Silent,Missense, Nonsense, Splice, miRNA, + Frameshift, Inframe, Stop_deletion, Promoter,De_novo_start,Splice_site_deletion,Splice_site_insertion + } + private Type type = Type.UNKNOWN; /** @@ -99,6 +107,14 @@ public class MafFeature implements Feature { return refAllele; } + public String getHugoGeneSymbol() { + return hugoSymbol; + } + + public String setHugoGeneSymbol(String genename) { + return hugoSymbol = genename; + } + /** * Returns list of alleles (represented as strings) observed in Tumor. Returned alleles * could be redundant (e.g. if we have homozygous non-ref at ploidy 2+).
@@ -187,6 +203,43 @@ public class MafFeature implements Feature { } } + public boolean isSomatic() { + if ( observedTumAlleles[0].equals(refAllele) && observedTumAlleles[1].equals(refAllele) ) return false; // tumor is ref + // we get here only if tumor is non-ref + if ( observedNormAlleles == null ) return true; // norm alleles are omitted from maf only if they are all ref + if ( observedNormAlleles[0].equals(refAllele) && observedNormAlleles[1].equals(refAllele) ) return true; + return false; + } + + public void setVariantClassification(String s) { + if ( s.equals("IGR") ) { classification = Classification.Intergenic ; return; } + if ( s.equals("Intron") ) { classification = Classification.Intron ; return; } + if ( s.equals("3'UTR") ) { classification = Classification.UTR3 ; return; } + if ( s.equals("5'UTR") ) { classification = Classification.UTR5 ; return; } + if ( s.equals("5'-Flank") ) { classification = Classification.Flank5 ; return; } + if ( s.equals("Silent") ) { classification = Classification.Silent ; return; } + if ( s.equals("Non-coding_Transcript")) { classification = Classification.Noncoding_transcript; return; } + if ( s.equals("Missense") || s.equals("Missense_Mutation") ) { classification = Classification.Missense ; return; } + if ( s.equals("Nonsense_Mutation") ) { classification = Classification.Nonsense ; return; } + if ( s.equals("Splice_Site") ) { classification = Classification.Splice ; return; } + if ( s.equals("miRNA") ) { classification = Classification.miRNA ; return; } + if ( s.equals("Frame_Shift_Ins") ) { classification = Classification.Frameshift ; return; } + if ( s.equals("Frame_Shift_Del") ) { classification = Classification.Frameshift ; return; } + if ( s.equals("In_Frame_Ins") ) { classification = Classification.Inframe ; return; } + if ( s.equals("In_Frame_Del") ) { classification = Classification.Inframe ; return; } + if ( s.equals("Stop_Codon_Del") ) { classification = Classification.Stop_deletion ; return; } + if ( 
s.equals("Splice_Site_Del") ) { classification = Classification.Splice_site_deletion ; return; } + if ( s.equals("Splice_Site_Ins") ) { classification = Classification.Splice_site_insertion ; return; } + if ( s.equals("Promoter") ) { classification = Classification.Promoter ; return; } + if ( s.equals("De_novo_Start") ) { classification = Classification.De_novo_start ; return; } + if ( s.equals("TX-REF-MISMATCH") ) { classification = Classification.Unclassified ; return; } + throw new UserException.MalformedFile("Unknown variant classification: " + s); + } + + public Classification getVariantClassification() { + return classification; + } + /* * the required getting and setter methods */